From 65eb44327cf32a83dbbf13eb70f9d8c03f3efaef Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 20 Jun 2024 10:02:29 +0000 Subject: [AMDGPU][SILoadStoreOptimizer] Merge constrained sloads Consider the constrained multi-dword loads while merging individual loads to a single multi-dword load. --- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 1 + llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 79 ++- .../CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 169 +++--- .../CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 12 +- .../GlobalISel/llvm.amdgcn.global.atomic.csub.ll | 12 +- .../AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll | 102 ++-- .../AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll | 100 ++-- .../AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll | 84 +-- llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 204 +++---- llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 130 ++-- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 42 +- .../CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll | 408 ++++++------- llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 48 +- llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 40 +- llvm/test/CodeGen/AMDGPU/bfm.ll | 10 +- llvm/test/CodeGen/AMDGPU/bitreverse.ll | 314 +++++----- llvm/test/CodeGen/AMDGPU/build_vector.ll | 50 +- llvm/test/CodeGen/AMDGPU/calling-conventions.ll | 50 +- llvm/test/CodeGen/AMDGPU/cluster_stores.ll | 120 ++-- llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll | 20 +- llvm/test/CodeGen/AMDGPU/ctlz.ll | 454 +++++++------- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 446 +++++++------- llvm/test/CodeGen/AMDGPU/ctpop16.ll | 100 ++-- llvm/test/CodeGen/AMDGPU/ctpop64.ll | 48 +- llvm/test/CodeGen/AMDGPU/cttz.ll | 370 ++++++------ llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 424 ++++++------- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 665 ++++++++++----------- .../AMDGPU/divergence-driven-buildvector.ll | 48 +- llvm/test/CodeGen/AMDGPU/ds_read2.ll | 16 +- llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll | 50 +- llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/fabs.ll | 34 +- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 6 +- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 63 +- llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 66 +- llvm/test/CodeGen/AMDGPU/fdiv.ll | 286 ++++----- llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 40 +- .../test/CodeGen/AMDGPU/flat_atomics_i32_system.ll | 112 ++-- llvm/test/CodeGen/AMDGPU/fma-combine.ll | 176 +++--- .../CodeGen/AMDGPU/fmul-2-combine-multi-use.ll | 18 +- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 480 +++++++-------- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 18 +- llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 46 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 34 +- llvm/test/CodeGen/AMDGPU/fneg.ll | 26 +- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 52 +- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 44 +- llvm/test/CodeGen/AMDGPU/fp-classify.ll | 26 +- .../CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll | 66 +- .../AMDGPU/fp-min-max-buffer-ptr-atomics.ll | 92 +-- llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll | 40 +- llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll | 40 +- llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll | 40 +- .../CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll | 48 +- .../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll | 76 +-- llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 38 +- llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 38 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 88 +-- llvm/test/CodeGen/AMDGPU/fshr.ll | 40 +- llvm/test/CodeGen/AMDGPU/global_atomics.ll | 48 +- .../CodeGen/AMDGPU/global_atomics_i32_system.ll | 196 +++--- llvm/test/CodeGen/AMDGPU/half.ll | 82 +-- llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | 176 +++--- .../AMDGPU/insert_waitcnt_for_precise_memory.ll | 248 ++++---- llvm/test/CodeGen/AMDGPU/kernel-args.ll | 106 ++-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 30 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 64 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 100 ++-- .../AMDGPU/llvm.amdgcn.global.atomic.csub.ll | 8 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 32 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 50 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll | 16 +- .../CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll | 72 +-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 392 ++++++------ .../CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll | 168 +++--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 36 +- .../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 132 ++-- .../llvm.amdgcn.sched.group.barrier.gfx11.ll | 28 +- .../llvm.amdgcn.sched.group.barrier.gfx12.ll | 28 +- .../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 244 ++++---- .../CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll | 80 +-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 82 +-- llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 116 ++-- llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 116 ++-- llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 26 +- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 36 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 36 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 64 +- .../CodeGen/AMDGPU/llvm.r600.read.local.size.ll | 8 +- llvm/test/CodeGen/AMDGPU/llvm.round.ll | 60 +- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 42 +- llvm/test/CodeGen/AMDGPU/madak.ll | 184 +++--- llvm/test/CodeGen/AMDGPU/memory_clause.ll | 36 +- llvm/test/CodeGen/AMDGPU/merge-s-load.mir | 180 ++++-- llvm/test/CodeGen/AMDGPU/min.ll | 54 +- llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll | 160 ++--- llvm/test/CodeGen/AMDGPU/mul.ll | 29 +- llvm/test/CodeGen/AMDGPU/mul_int24.ll | 76 +-- llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 110 ++-- llvm/test/CodeGen/AMDGPU/or.ll | 16 +- llvm/test/CodeGen/AMDGPU/packed-op-sel.ll | 72 +-- llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 48 +- .../CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll | 48 +- llvm/test/CodeGen/AMDGPU/rotl.ll | 26 +- llvm/test/CodeGen/AMDGPU/rotr.ll | 22 +- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 72 +-- llvm/test/CodeGen/AMDGPU/sign_extend.ll | 54 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 8 +- llvm/test/CodeGen/AMDGPU/sub.ll | 34 +- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 72 +-- llvm/test/CodeGen/AMDGPU/udiv.ll | 34 +- llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 30 +- llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 112 ++-- llvm/test/CodeGen/AMDGPU/wave32.ll | 54 +- llvm/test/CodeGen/AMDGPU/xor.ll | 10 +- 116 files changed, 5687 insertions(+), 5549 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index db5b467..19d5b950 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -967,6 +967,7 @@ public: bool hasLDSFPAtomicAddF32() const { return GFX8Insts; } bool hasLDSFPAtomicAddF64() const { return GFX90AInsts; } + bool hasXnackReplay() const { return GFX8Insts; } /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 8b42d4a..0b285d5 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -216,7 +216,8 @@ private: CombineInfo &Paired, bool Modify = false); static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI, const CombineInfo &Paired); - static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired); + static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired, + const GCNSubtarget *STI = nullptr); static std::pair getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired); const TargetRegisterClass * @@ -343,6 +344,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORD_IMM: case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM: case AMDGPU::S_LOAD_DWORD_IMM: + case AMDGPU::S_LOAD_DWORD_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORD: case AMDGPU::GLOBAL_LOAD_DWORD_SADDR: case AMDGPU::GLOBAL_STORE_DWORD: @@ -353,6 +355,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX2_IMM: + case AMDGPU::S_LOAD_DWORDX2_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX2: case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX2: @@ -363,6 +366,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX3_IMM: + case AMDGPU::S_LOAD_DWORDX3_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX3: case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX3: @@ -373,6 +377,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: + case AMDGPU::S_LOAD_DWORDX4_IMM_ec: case AMDGPU::GLOBAL_LOAD_DWORDX4: case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR: case AMDGPU::GLOBAL_STORE_DWORDX4: @@ -383,6 +388,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) { case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM: case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return 8; case AMDGPU::DS_READ_B32: case AMDGPU::DS_READ_B32_gfx9: @@ -507,6 +513,11 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORD_IMM_ec: + case AMDGPU::S_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return S_LOAD_IMM; case AMDGPU::DS_READ_B32: case AMDGPU::DS_READ_B32_gfx9: @@ -591,6 +602,11 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORD_IMM_ec: + case AMDGPU::S_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_LOAD_DWORDX8_IMM_ec: return AMDGPU::S_LOAD_DWORD_IMM; case AMDGPU::GLOBAL_LOAD_DWORD: case AMDGPU::GLOBAL_LOAD_DWORDX2: @@ -703,6 +719,11 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) { case AMDGPU::S_LOAD_DWORDX3_IMM: case AMDGPU::S_LOAD_DWORDX4_IMM: case AMDGPU::S_LOAD_DWORDX8_IMM: + case AMDGPU::S_LOAD_DWORD_IMM_ec: + case AMDGPU::S_LOAD_DWORDX2_IMM_ec: + case AMDGPU::S_LOAD_DWORDX3_IMM_ec: + case AMDGPU::S_LOAD_DWORDX4_IMM_ec: + case AMDGPU::S_LOAD_DWORDX8_IMM_ec: Result.SBase = true; return Result; case AMDGPU::DS_READ_B32: @@ -1212,8 +1233,17 @@ void SILoadStoreOptimizer::copyToDestRegs( // Copy to the old destination registers. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY); - const auto *Dest0 = TII->getNamedOperand(*CI.I, OpName); - const auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName); + auto *Dest0 = TII->getNamedOperand(*CI.I, OpName); + auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName); + + // The constrained sload instructions in S_LOAD_IMM class will have + // `early-clobber` flag in the dst operand. Remove the flag before using the + // MOs in copies. + if (Dest0->isEarlyClobber()) + Dest0->setIsEarlyClobber(false); + + if (Dest1->isEarlyClobber()) + Dest1->setIsEarlyClobber(false); BuildMI(*MBB, InsertBefore, DL, CopyDesc) .add(*Dest0) // Copy to same destination including flags and sub reg. @@ -1446,7 +1476,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair( MachineBasicBlock::iterator InsertBefore) { MachineBasicBlock *MBB = CI.I->getParent(); DebugLoc DL = CI.I->getDebugLoc(); - const unsigned Opcode = getNewOpcode(CI, Paired); + const unsigned Opcode = getNewOpcode(CI, Paired, STM); const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired); @@ -1658,7 +1688,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair( } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, - const CombineInfo &Paired) { + const CombineInfo &Paired, + const GCNSubtarget *STI) { const unsigned Width = CI.Width + Paired.Width; switch (getCommonInstClass(CI, Paired)) { @@ -1701,17 +1732,33 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI, return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM; } case S_LOAD_IMM: - switch (Width) { - default: - return 0; - case 2: - return AMDGPU::S_LOAD_DWORDX2_IMM; - case 3: - return AMDGPU::S_LOAD_DWORDX3_IMM; - case 4: - return AMDGPU::S_LOAD_DWORDX4_IMM; - case 8: - return AMDGPU::S_LOAD_DWORDX8_IMM; + // For targets that support XNACK replay, use the constrained load opcode. + if (STI && STI->hasXnackReplay()) { + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_LOAD_DWORDX2_IMM_ec; + case 3: + return AMDGPU::S_LOAD_DWORDX3_IMM_ec; + case 4: + return AMDGPU::S_LOAD_DWORDX4_IMM_ec; + case 8: + return AMDGPU::S_LOAD_DWORDX8_IMM_ec; + } + } else { + switch (Width) { + default: + return 0; + case 2: + return AMDGPU::S_LOAD_DWORDX2_IMM; + case 3: + return AMDGPU::S_LOAD_DWORDX3_IMM; + case 4: + return AMDGPU::S_LOAD_DWORDX4_IMM; + case 8: + return AMDGPU::S_LOAD_DWORDX8_IMM; + } } case GLOBAL_LOAD: switch (Width) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index eb20178..3f034ea 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -468,18 +468,18 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -513,16 +513,16 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v1, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 @@ -562,16 +562,16 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 @@ -612,16 +612,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v3, v[0:1] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 @@ -679,11 +679,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 @@ -706,12 +706,12 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: v_or_b32_e32 v3, v1, v0 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -770,6 +770,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: v_mov_b32_e32 v7, 8 @@ -779,11 +780,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v1, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0xff -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v1 ; VI-NEXT: v_and_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -804,8 +803,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_lshlrev_b32_e32 v2, 24, v6 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; VI-NEXT: v_or_b32_e32 v2, v0, v2 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid.x = call i32 @llvm.amdgcn.workitem.id.x() @@ -858,11 +857,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 @@ -884,10 +883,10 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: flat_load_ubyte v4, v[8:9] ; VI-NEXT: flat_load_ubyte v5, v[10:11] ; VI-NEXT: flat_load_ubyte v6, v[12:13] -; VI-NEXT: v_mov_b32_e32 v8, s1 -; VI-NEXT: v_mov_b32_e32 v7, s0 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v8, s5 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v7, s4 ; VI-NEXT: v_mov_b32_e32 v10, s1 ; VI-NEXT: v_mov_b32_e32 v9, s0 ; VI-NEXT: s_waitcnt vmcnt(6) @@ -949,18 +948,18 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] -; VI-NEXT: v_mov_b32_e32 v9, s1 -; VI-NEXT: v_mov_b32_e32 v8, s0 -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s4, 16 +; VI-NEXT: v_mov_b32_e32 v9, s5 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: v_mov_b32_e32 v11, s1 ; VI-NEXT: v_mov_b32_e32 v10, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1005,19 +1004,19 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1051,18 +1050,18 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1096,18 +1095,18 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1157,11 +1156,11 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v0 @@ -1184,12 +1183,12 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_or_b32_e32 v1, v2, v3 ; VI-NEXT: v_or_b32_e32 v3, v1, v0 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v3 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1221,18 +1220,18 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1265,18 +1264,18 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1310,18 +1309,18 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1354,18 +1353,18 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index fff341b..a018ea5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -159,10 +159,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) @@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: v_mov_b32_e32 v0, s3 +; GFX940-NEXT: v_mov_b32_e32 v1, s2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_bf16 v1, v0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll index ade6e55..dbe7d47 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll @@ -160,10 +160,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[4:5] offset:4096 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v0, off ; GFX12-NEXT: s_nop 0 @@ -199,10 +199,10 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa ; ; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[4:5] offset:4096 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll index 6e96a4d..87d0d71 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -628,7 +628,7 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 @@ -637,10 +637,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; GFX1030-NEXT: v_mov_b32_e32 v2, s2 -; GFX1030-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; GFX1030-NEXT: v_mov_b32_e32 v3, s7 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -651,24 +651,24 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm ; ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 ; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 -; GFX1013-NEXT: v_mov_b32_e32 v2, s2 -; GFX1013-NEXT: v_mov_b32_e32 v3, s3 +; GFX1013-NEXT: v_mov_b32_e32 v0, s4 +; GFX1013-NEXT: v_mov_b32_e32 v1, s5 +; GFX1013-NEXT: v_mov_b32_e32 v2, s6 +; GFX1013-NEXT: v_mov_b32_e32 v3, s7 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 @@ -681,42 +681,42 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX11-NEXT: s_mov_b32 s8, 0x40400000 +; GFX11-NEXT: s_mov_b32 s2, 2.0 +; GFX11-NEXT: s_mov_b32 s1, 1.0 +; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: s_mov_b32 s10, 0x40a00000 -; GFX11-NEXT: s_mov_b32 s9, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 ; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: s_mov_b32 s1, 1.0 -; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_mov_b32 s4, 0x40400000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: s_mov_b32 s2, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: s_mov_b32 s6, 0x40a00000 +; GFX11-NEXT: s_mov_b32 s5, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v4, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[4:7] +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[8:11] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm @@ -742,16 +742,16 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s0 -; GFX1030-NEXT: v_mov_b32_e32 v1, s1 -; GFX1030-NEXT: v_mov_b32_e32 v2, s2 -; GFX1030-NEXT: v_mov_b32_e32 v3, s3 +; GFX1030-NEXT: v_mov_b32_e32 v0, s4 +; GFX1030-NEXT: v_mov_b32_e32 v1, s5 +; GFX1030-NEXT: v_mov_b32_e32 v2, s6 +; GFX1030-NEXT: v_mov_b32_e32 v3, s7 ; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 @@ -762,21 +762,21 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm ; ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_mov_b32_e32 v0, s0 -; GFX1013-NEXT: v_mov_b32_e32 v1, s1 -; GFX1013-NEXT: v_mov_b32_e32 v2, s2 -; GFX1013-NEXT: v_mov_b32_e32 v3, s3 +; GFX1013-NEXT: v_mov_b32_e32 v0, s4 +; GFX1013-NEXT: v_mov_b32_e32 v1, s5 +; GFX1013-NEXT: v_mov_b32_e32 v2, s6 +; GFX1013-NEXT: v_mov_b32_e32 v3, s7 ; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 ; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo ; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 @@ -789,37 +789,37 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX11-NEXT: s_mov_b32 s8, 0x42004600 -; GFX11-NEXT: s_mov_b32 s9, 0x44004700 -; GFX11-NEXT: s_mov_b32 s10, 0x45004800 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_mov_b32 s2, 2.0 ; GFX11-NEXT: s_mov_b32 s1, 1.0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: s_mov_b32 s4, 0x42004600 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX11-NEXT: s_mov_b32 s5, 0x44004700 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: s_mov_b32 s2, 2.0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s8 +; GFX11-NEXT: s_mov_b32 s6, 0x45004800 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s4 ; GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v4, s9 +; GFX11-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v4, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[4:7] a16 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[8:11] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 0c60be9..fa24489 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -180,21 +180,21 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s4, 1 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s0, 1 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 store <2 x i32> %tmp, ptr addrspace(1) %out @@ -204,21 +204,21 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s4, 1.0 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s0, 1.0 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 store <2 x float> %tmp, ptr addrspace(1) %out @@ -248,21 +248,21 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x10001 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s0, 0x10001 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 store <4 x i16> %tmp, ptr addrspace(1) %out @@ -272,21 +272,21 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s0, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 store <4 x half> %tmp, ptr addrspace(1) %out @@ -296,21 +296,21 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s0, 0x3f803f80 +; GCN-NEXT: s_mov_b32 s1, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, v2 ; GCN-NEXT: v_mov_b32_e32 v1, v3 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 store <4 x bfloat> %tmp, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll index 727184a..d628270 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -6,36 +6,36 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) { ; GFX8-LABEL: dpp_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: dpp_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX11-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -156,47 +156,47 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1 define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> %in1, <2 x i32> %in2) { ; GFX8-LABEL: update_dppv2i32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: update_dppv2i32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2i32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -211,47 +211,47 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32> define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x float> %in1, <2 x float> %in2) { ; GFX8-LABEL: update_dppv2f32_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v5, s3 -; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: update_dppv2f32_test: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1] -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5] +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: update_dppv2f32_test: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 -; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 -; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1] +; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index b666f45..c3bd566 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -616,26 +616,26 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: sdivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i32 s2, s10, 31 -; GFX8-NEXT: s_add_i32 s0, s10, s2 +; GFX8-NEXT: s_ashr_i32 s2, s14, 31 +; GFX8-NEXT: s_add_i32 s0, s14, s2 ; GFX8-NEXT: s_xor_b32 s3, s0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX8-NEXT: s_ashr_i32 s10, s11, 31 -; GFX8-NEXT: s_add_i32 s0, s11, s10 -; GFX8-NEXT: s_xor_b32 s11, s0, s10 +; GFX8-NEXT: s_ashr_i32 s4, s15, 31 +; GFX8-NEXT: s_add_i32 s0, s15, s4 +; GFX8-NEXT: s_xor_b32 s5, s0, s4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: s_sub_i32 s0, 0, s3 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX8-NEXT: s_ashr_i32 s12, s8, 31 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s5 +; GFX8-NEXT: s_ashr_i32 s6, s12, 31 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX8-NEXT: s_sub_i32 s1, 0, s11 +; GFX8-NEXT: s_sub_i32 s1, 0, s5 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX8-NEXT: s_add_i32 s0, s8, s12 -; GFX8-NEXT: s_xor_b32 s0, s0, s12 +; GFX8-NEXT: s_add_i32 s0, s12, s6 +; GFX8-NEXT: s_xor_b32 s0, s0, s6 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 @@ -654,159 +654,159 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 -; GFX8-NEXT: s_xor_b32 s0, s12, s2 -; GFX8-NEXT: s_ashr_i32 s2, s9, 31 -; GFX8-NEXT: s_add_i32 s1, s9, s2 +; GFX8-NEXT: s_xor_b32 s0, s6, s2 +; GFX8-NEXT: s_ashr_i32 s2, s13, 31 +; GFX8-NEXT: s_add_i32 s1, s13, s2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, s12, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, s6, v3 ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mul_lo_u32 v3, v1, s11 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s12, v2 +; GFX8-NEXT: v_mul_lo_u32 v3, v1, s5 +; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s6, v2 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s1, v3 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 -; GFX8-NEXT: s_xor_b32 s0, s2, s10 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s5, v3 +; GFX8-NEXT: s_xor_b32 s0, s2, s4 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sdivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s8, s6, 31 -; GFX9-NEXT: s_add_i32 s6, s6, s8 -; GFX9-NEXT: s_xor_b32 s6, s6, s8 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 -; GFX9-NEXT: s_ashr_i32 s9, s7, 31 -; GFX9-NEXT: s_add_i32 s7, s7, s9 -; GFX9-NEXT: s_xor_b32 s7, s7, s9 +; GFX9-NEXT: s_ashr_i32 s0, s14, 31 +; GFX9-NEXT: s_add_i32 s1, s14, s0 +; GFX9-NEXT: s_xor_b32 s1, s1, s0 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 +; GFX9-NEXT: s_ashr_i32 s2, s15, 31 +; GFX9-NEXT: s_add_i32 s3, s15, s2 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 -; GFX9-NEXT: s_sub_i32 s12, 0, s6 -; GFX9-NEXT: s_ashr_i32 s10, s4, 31 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: s_sub_i32 s6, 0, s1 +; GFX9-NEXT: s_ashr_i32 s4, s12, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: s_add_i32 s4, s4, s10 -; GFX9-NEXT: s_xor_b32 s4, s4, s10 -; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 +; GFX9-NEXT: s_sub_i32 s7, 0, s3 +; GFX9-NEXT: s_ashr_i32 s5, s13, 31 +; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX9-NEXT: s_sub_i32 s12, 0, s7 +; GFX9-NEXT: s_add_i32 s6, s12, s4 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 -; GFX9-NEXT: s_ashr_i32 s11, s5, 31 -; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 -; GFX9-NEXT: s_add_i32 s5, s5, s11 +; GFX9-NEXT: s_xor_b32 s6, s6, s4 +; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 +; GFX9-NEXT: s_add_i32 s7, s13, s5 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 ; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 -; GFX9-NEXT: s_xor_b32 s5, s5, s11 -; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 +; GFX9-NEXT: s_xor_b32 s7, s7, s5 +; GFX9-NEXT: s_xor_b32 s0, s4, s0 +; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 -; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: s_xor_b32 s4, s10, s8 -; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 -; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 -; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3 -; GFX9-NEXT: s_xor_b32 s4, s11, s9 +; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3 +; GFX9-NEXT: s_xor_b32 s0, s5, s2 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 -; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 +; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 +; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2 -; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3] +; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 +; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ashr_i32 s1, s10, 31 -; GFX10-NEXT: s_ashr_i32 s2, s11, 31 -; GFX10-NEXT: s_add_i32 s0, s10, s1 -; GFX10-NEXT: s_add_i32 s3, s11, s2 -; GFX10-NEXT: s_xor_b32 s10, s0, s1 +; GFX10-NEXT: s_ashr_i32 s1, s14, 31 +; GFX10-NEXT: s_ashr_i32 s2, s15, 31 +; GFX10-NEXT: s_add_i32 s0, s14, s1 +; GFX10-NEXT: s_add_i32 s3, s15, s2 +; GFX10-NEXT: s_xor_b32 s4, s0, s1 ; GFX10-NEXT: s_xor_b32 s3, s3, s2 -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3 -; GFX10-NEXT: s_sub_i32 s0, 0, s10 -; GFX10-NEXT: s_sub_i32 s11, 0, s3 -; GFX10-NEXT: s_ashr_i32 s12, s9, 31 +; GFX10-NEXT: s_sub_i32 s0, 0, s4 +; GFX10-NEXT: s_sub_i32 s5, 0, s3 +; GFX10-NEXT: s_ashr_i32 s6, s13, 31 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX10-NEXT: s_add_i32 s7, s13, s6 +; GFX10-NEXT: s_xor_b32 s7, s7, s6 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1 -; GFX10-NEXT: s_ashr_i32 s11, s8, 31 -; GFX10-NEXT: s_add_i32 s0, s8, s11 -; GFX10-NEXT: s_add_i32 s8, s9, s12 -; GFX10-NEXT: s_xor_b32 s0, s0, s11 -; GFX10-NEXT: s_xor_b32 s8, s8, s12 +; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1 +; GFX10-NEXT: s_ashr_i32 s5, s12, 31 +; GFX10-NEXT: s_add_i32 s0, s12, s5 +; GFX10-NEXT: s_xor_b32 s1, s5, s1 +; GFX10-NEXT: s_xor_b32 s0, s0, s5 ; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 -; GFX10-NEXT: s_xor_b32 s1, s11, s1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 ; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX10-NEXT: v_mul_hi_u32 v1, s7, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4 ; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 @@ -814,26 +814,26 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2 ; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: s_xor_b32 s0, s12, s2 +; GFX10-NEXT: s_xor_b32 s0, s6, s2 ; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0 ; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2 -; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3 +; GFX10-NEXT: v_xor_b32_e32 v2, s5, v2 +; GFX10-NEXT: v_xor_b32_e32 v3, s6, v3 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3 -; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3 +; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX10-NEXT: s_endpgm %div = sdiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index a58397e..63a0d8a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -522,11 +522,11 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) { ; GFX8-LABEL: udivrem_v2i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX8-NEXT: s_sub_i32 s0, 0, s10 +; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX8-NEXT: s_sub_i32 s0, 0, s14 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -534,54 +534,54 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX8-NEXT: v_mul_lo_u32 v2, s0, v0 -; GFX8-NEXT: s_sub_i32 s0, 0, s11 +; GFX8-NEXT: s_sub_i32 s0, 0, s15 ; GFX8-NEXT: v_mul_lo_u32 v3, s0, v1 ; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 -; GFX8-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX8-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, s14 ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_mul_lo_u32 v4, v1, s11 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX8-NEXT: v_mul_lo_u32 v4, v1, s15 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s10, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s14, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 1, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s10, v2 +; GFX8-NEXT: v_subrev_u32_e64 v3, s[0:1], s14, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s9, v4 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s13, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s15, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s11, v3 +; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s15, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udivrem_v2i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX9-NEXT: s_sub_i32 s0, 0, s10 -; GFX9-NEXT: s_sub_i32 s1, 0, s11 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX9-NEXT: s_sub_i32 s0, 0, s14 +; GFX9-NEXT: s_sub_i32 s1, 0, s15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -593,47 +593,47 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_mul_hi_u32 v0, s8, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, s12, v0 ; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 -; GFX9-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, s10 +; GFX9-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX9-NEXT: v_mul_lo_u32 v2, v0, s14 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX9-NEXT: v_mul_lo_u32 v3, v1, s15 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 -; GFX9-NEXT: v_sub_u32_e32 v2, s8, v2 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 -; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 +; GFX9-NEXT: v_sub_u32_e32 v2, s12, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 +; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 +; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v3 ; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, s10, v2 +; GFX9-NEXT: v_subrev_u32_e32 v4, s14, v2 ; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 -; GFX9-NEXT: v_subrev_u32_e32 v4, s11, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v3 +; GFX9-NEXT: v_subrev_u32_e32 v4, s15, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9] +; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10 -; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s11 -; GFX10-NEXT: s_sub_i32 s0, 0, s10 -; GFX10-NEXT: s_sub_i32 s1, 0, s11 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s14 +; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15 +; GFX10-NEXT: s_sub_i32 s0, 0, s14 +; GFX10-NEXT: s_sub_i32 s1, 0, s15 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -646,34 +646,34 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3 ; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3 -; GFX10-NEXT: v_mul_hi_u32 v0, s8, v0 -; GFX10-NEXT: v_mul_hi_u32 v1, s9, v1 -; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10 -; GFX10-NEXT: v_mul_lo_u32 v3, v1, s11 +; GFX10-NEXT: v_mul_hi_u32 v0, s12, v0 +; GFX10-NEXT: v_mul_hi_u32 v1, s13, v1 +; GFX10-NEXT: v_mul_lo_u32 v2, v0, s14 +; GFX10-NEXT: v_mul_lo_u32 v3, v1, s15 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, s8, v2 -; GFX10-NEXT: v_sub_nc_u32_e32 v3, s9, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 +; GFX10-NEXT: v_sub_nc_u32_e32 v2, s12, v2 +; GFX10-NEXT: v_sub_nc_u32_e32 v3, s13, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo ; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 -; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2 -; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s11, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s11, v3 +; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s14, v2 +; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s15, v3 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[4:5] -; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[6:7] +; GFX10-NEXT: global_store_dwordx2 v8, v[0:1], s[8:9] +; GFX10-NEXT: global_store_dwordx2 v8, v[2:3], s[10:11] ; GFX10-NEXT: s_endpgm %div = udiv <2 x i32> %x, %y store <2 x i32> %div, ptr addrspace(1) %out0 diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 6f67ce4..be9b5b0 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -226,47 +226,47 @@ define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; VI-LABEL: s_test_add_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_add_i32 s2, s2, s3 -; VI-NEXT: s_add_i32 s4, s4, s5 -; VI-NEXT: s_and_b32 s2, s2, 0xffff -; VI-NEXT: s_lshl_b32 s3, s4, 16 -; VI-NEXT: s_or_b32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshr_b32 s0, s6, 16 +; VI-NEXT: s_lshr_b32 s1, s7, 16 +; VI-NEXT: s_add_i32 s2, s6, s7 +; VI-NEXT: s_add_i32 s0, s0, s1 +; VI-NEXT: s_and_b32 s1, s2, 0xffff +; VI-NEXT: s_lshl_b32 s0, s0, 16 +; VI-NEXT: s_or_b32 s0, s1, s0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_test_add_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_pk_add_u16 v1, s2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_pk_add_u16 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_add_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_add_u16 v1, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_pk_add_u16 v1, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_add_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_u16 v1, s2, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_pk_add_u16 v1, s6, s7 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 8144fb7..559871d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -72,31 +72,31 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -167,29 +167,29 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: urem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s4, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s4, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: s_sub_i32 s4, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s4, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s0, s0, s7 +; GFX9-NEXT: s_sub_i32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: s_sub_i32 s1, s0, s7 +; GFX9-NEXT: s_cmp_ge_u32 s0, s7 +; GFX9-NEXT: s_cselect_b32 s0, s1, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = urem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -280,37 +280,37 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: sdiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s4, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 -; GFX9-NEXT: s_sub_i32 s5, 0, s4 -; GFX9-NEXT: s_xor_b32 s3, s2, s3 -; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: s_abs_i32 s0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_xor_b32 s1, s6, s7 +; GFX9-NEXT: s_abs_i32 s2, s6 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_ashr_i32 s3, s3, 31 +; GFX9-NEXT: s_ashr_i32 s1, s1, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s6, s5, s4 +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s6, s3, s0 ; GFX9-NEXT: s_sub_i32 s2, s2, s6 -; GFX9-NEXT: s_add_i32 s7, s5, 1 -; GFX9-NEXT: s_sub_i32 s6, s2, s4 -; GFX9-NEXT: s_cmp_ge_u32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s5, s7, s5 +; GFX9-NEXT: s_add_i32 s7, s3, 1 +; GFX9-NEXT: s_sub_i32 s6, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s3, s7, s3 ; GFX9-NEXT: s_cselect_b32 s2, s6, s2 -; GFX9-NEXT: s_add_i32 s6, s5, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s4 -; GFX9-NEXT: s_cselect_b32 s2, s6, s5 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_add_i32 s6, s3, 1 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s6, s3 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -394,34 +394,34 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; ; GFX9-LABEL: srem_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_abs_i32 s3, s3 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_abs_i32 s2, s2 +; GFX9-NEXT: s_abs_i32 s0, s7 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 31 +; GFX9-NEXT: s_abs_i32 s2, s6 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s3, s3, s0 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s3, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %r = srem i32 %x, %y store i32 %r, ptr addrspace(1) %out @@ -5482,13 +5482,13 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: udiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_add_i32 s3, s3, 12 -; GFX9-NEXT: s_lshr_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_add_i32 s0, s7, 12 +; GFX9-NEXT: s_lshr_b32 s0, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = udiv i32 %x, %shl.y @@ -5524,14 +5524,14 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: udiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s2, s2, 12 -; GFX9-NEXT: s_lshr_b32 s3, s3, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_lshr_b32 s0, s6, 12 +; GFX9-NEXT: s_lshr_b32 s1, s7, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5570,18 +5570,18 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, < ; ; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mul_hi_u32 s4, s3, 0x100101 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s3, s3, 1 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_lshr_b32 s2, s2, 12 -; GFX9-NEXT: s_lshr_b32 s3, s3, 11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_mul_hi_u32 s1, s7, 0x100101 +; GFX9-NEXT: s_sub_i32 s2, s7, s1 +; GFX9-NEXT: s_lshr_b32 s2, s2, 1 +; GFX9-NEXT: s_add_i32 s2, s2, s1 +; GFX9-NEXT: s_lshr_b32 s0, s6, 12 +; GFX9-NEXT: s_lshr_b32 s1, s2, 11 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = udiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -5875,14 +5875,14 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: urem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_add_i32 s3, s3, -1 -; GFX9-NEXT: s_and_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_add_i32 s0, s0, -1 +; GFX9-NEXT: s_and_b32 s0, s6, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = urem i32 %x, %shl.y @@ -5918,14 +5918,14 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: urem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xfff -; GFX9-NEXT: s_and_b32 s3, s3, 0xfff -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_and_b32 s0, s6, 0xfff +; GFX9-NEXT: s_and_b32 s1, s7, 0xfff +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = urem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6234,41 +6234,41 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: sdiv_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s6, 0, s3 -; GFX9-NEXT: s_ashr_i32 s5, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s5 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s2, s6, 31 +; GFX9-NEXT: s_add_i32 s3, s6, s2 +; GFX9-NEXT: s_sub_i32 s6, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s5 +; GFX9-NEXT: s_xor_b32 s3, s3, s2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s7, v0 ; GFX9-NEXT: s_mul_i32 s6, s6, s7 ; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 ; GFX9-NEXT: s_add_i32 s7, s7, s6 -; GFX9-NEXT: s_mul_hi_u32 s6, s2, s7 -; GFX9-NEXT: s_mul_i32 s8, s6, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s8 +; GFX9-NEXT: s_mul_hi_u32 s6, s3, s7 +; GFX9-NEXT: s_mul_i32 s8, s6, s0 +; GFX9-NEXT: s_sub_i32 s3, s3, s8 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_sub_i32 s8, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 +; GFX9-NEXT: s_sub_i32 s8, s3, s0 +; GFX9-NEXT: s_cmp_ge_u32 s3, s0 ; GFX9-NEXT: s_cselect_b32 s6, s7, s6 -; GFX9-NEXT: s_cselect_b32 s2, s8, s2 +; GFX9-NEXT: s_cselect_b32 s3, s8, s3 ; GFX9-NEXT: s_add_i32 s7, s6, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s7, s6 -; GFX9-NEXT: s_xor_b32 s3, s5, s4 -; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_cmp_ge_u32 s3, s0 +; GFX9-NEXT: s_cselect_b32 s0, s7, s6 +; GFX9-NEXT: s_xor_b32 s1, s2, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = sdiv i32 %x, %shl.y @@ -6310,20 +6310,20 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: sdiv_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_lshr_b32 s5, s5, 20 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_add_i32 s3, s3, s5 -; GFX9-NEXT: s_ashr_i32 s2, s2, 12 -; GFX9-NEXT: s_ashr_i32 s3, s3, 12 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_lshr_b32 s1, s1, 20 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_add_i32 s1, s7, s1 +; GFX9-NEXT: s_ashr_i32 s0, s0, 12 +; GFX9-NEXT: s_ashr_i32 s1, s1, 12 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6365,21 +6365,21 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, ; ; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_mul_hi_i32 s5, s3, 0x80080081 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_add_i32 s5, s5, s3 -; GFX9-NEXT: s_add_i32 s2, s2, s4 -; GFX9-NEXT: s_lshr_b32 s3, s5, 31 -; GFX9-NEXT: s_ashr_i32 s4, s5, 11 -; GFX9-NEXT: s_ashr_i32 s2, s2, 12 -; GFX9-NEXT: s_add_i32 s4, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_mul_hi_i32 s1, s7, 0x80080081 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_add_i32 s1, s1, s7 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_lshr_b32 s2, s1, 31 +; GFX9-NEXT: s_ashr_i32 s1, s1, 11 +; GFX9-NEXT: s_ashr_i32 s0, s0, 12 +; GFX9-NEXT: s_add_i32 s1, s1, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = sdiv <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out @@ -6746,38 +6746,38 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x ; ; GFX9-LABEL: srem_i32_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX9-NEXT: s_ashr_i32 s4, s3, 31 -; GFX9-NEXT: s_add_i32 s3, s3, s4 -; GFX9-NEXT: s_xor_b32 s3, s3, s4 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s5, 0, s3 -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_add_i32 s2, s2, s4 +; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s7 +; GFX9-NEXT: s_ashr_i32 s1, s0, 31 +; GFX9-NEXT: s_add_i32 s0, s0, s1 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GFX9-NEXT: s_ashr_i32 s1, s6, 31 +; GFX9-NEXT: s_add_i32 s2, s6, s1 +; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 +; GFX9-NEXT: s_xor_b32 s2, s2, s1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 -; GFX9-NEXT: s_mul_i32 s5, s5, s6 -; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 -; GFX9-NEXT: s_add_i32 s6, s6, s5 -; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6 -; GFX9-NEXT: s_mul_i32 s5, s5, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_xor_b32 s2, s2, s4 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_mul_i32 s3, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s3, s6, s3 +; GFX9-NEXT: s_add_i32 s6, s6, s3 +; GFX9-NEXT: s_mul_hi_u32 s3, s2, s6 +; GFX9-NEXT: s_mul_i32 s3, s3, s0 +; GFX9-NEXT: s_sub_i32 s2, s2, s3 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_sub_i32 s3, s2, s0 +; GFX9-NEXT: s_cmp_ge_u32 s2, s0 +; GFX9-NEXT: s_cselect_b32 s0, s3, s2 +; GFX9-NEXT: s_xor_b32 s0, s0, s1 +; GFX9-NEXT: s_sub_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm %shl.y = shl i32 4096, %y %r = srem i32 %x, %shl.y @@ -6821,22 +6821,22 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3 ; ; GFX9-LABEL: srem_v2i32_pow2k_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s4, s2, 31 -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_lshr_b32 s4, s4, 20 -; GFX9-NEXT: s_lshr_b32 s5, s5, 20 -; GFX9-NEXT: s_add_i32 s4, s2, s4 -; GFX9-NEXT: s_add_i32 s5, s3, s5 -; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s2, s2, s4 -; GFX9-NEXT: s_and_b32 s4, s5, 0xfffff000 -; GFX9-NEXT: s_sub_i32 s3, s3, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_ashr_i32 s0, s6, 31 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_lshr_b32 s0, s0, 20 +; GFX9-NEXT: s_lshr_b32 s1, s1, 20 +; GFX9-NEXT: s_add_i32 s0, s6, s0 +; GFX9-NEXT: s_add_i32 s1, s7, s1 +; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 +; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 +; GFX9-NEXT: s_sub_i32 s0, s6, s0 +; GFX9-NEXT: s_sub_i32 s1, s7, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %r = srem <2 x i32> %x, store <2 x i32> %r, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 2c69ae5..ad6009e 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -9,19 +9,19 @@ declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_endpgm entry: @@ -33,19 +33,19 @@ entry: define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 ; GFX12-GISEL-NEXT: s_endpgm entry: @@ -92,18 +92,18 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[4:5] offset:-16 th:TH_ATOMIC_RETURN ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:-16 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 @@ -114,20 +114,20 @@ entry: define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { ; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[4:5] offset:-16 ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[4:5] offset:-16 ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index af4116b..1639ec6 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -131,13 +131,13 @@ define amdgpu_kernel void @s_ubfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_ubfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: s_sub_i32 s0, 32, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s1, s2, s0 +; VI-NEXT: s_sub_i32 s0, 32, s7 +; VI-NEXT: s_lshl_b32 s1, s6, s0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: s_lshr_b32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -175,13 +175,13 @@ define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_ubfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: s_sub_i32 s0, 32, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s1, s2, s0 +; VI-NEXT: s_sub_i32 s0, 32, s7 +; VI-NEXT: s_lshl_b32 s1, s6, s0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: s_lshr_b32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -329,13 +329,13 @@ define amdgpu_kernel void @s_sbfe_sub_i32(ptr addrspace(1) %out, i32 %src, i32 % ; ; VI-LABEL: s_sbfe_sub_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: s_sub_i32 s0, 32, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s1, s2, s0 +; VI-NEXT: s_sub_i32 s0, 32, s7 +; VI-NEXT: s_lshl_b32 s1, s6, s0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: s_ashr_i32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s0 @@ -373,13 +373,13 @@ define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, i ; ; VI-LABEL: s_sbfe_sub_multi_use_shl_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: s_sub_i32 s0, 32, s3 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_lshl_b32 s1, s2, s0 +; VI-NEXT: s_sub_i32 s0, 32, s7 +; VI-NEXT: s_lshl_b32 s1, s6, s0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; VI-NEXT: s_ashr_i32 s0, s1, s0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index f8bd44b..8b2f66b 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -18,12 +18,12 @@ define amdgpu_kernel void @s_bfm_pattern(ptr addrspace(1) %out, i32 %x, i32 %y) ; ; VI-LABEL: s_bfm_pattern: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfm_b32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_bfm_b32 s0, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %a = shl i32 1, %x diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 64555f1..49ec09d 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -117,64 +117,64 @@ define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i16: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 -; FLAT-NEXT: s_mov_b32 s10, s6 -; FLAT-NEXT: s_mov_b32 s11, s7 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_mov_b32 s10, s2 +; FLAT-NEXT: s_mov_b32 s11, s3 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_mov_b32 s8, s2 -; FLAT-NEXT: s_mov_b32 s9, s3 +; FLAT-NEXT: s_mov_b32 s8, s6 +; FLAT-NEXT: s_mov_b32 s9, s7 ; FLAT-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; FLAT-NEXT: s_mov_b32 s4, s0 -; FLAT-NEXT: s_mov_b32 s5, s1 +; FLAT-NEXT: s_mov_b32 s0, s4 +; FLAT-NEXT: s_mov_b32 s1, s5 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 ; FLAT-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0 +; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_i16: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GISEL-NEXT: flat_load_ushort v0, v[0:1] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: flat_store_short v[0:1], v2 ; GISEL-NEXT: s_endpgm ; ; GFX11-FLAT-LABEL: v_brev_i16: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 ; GFX11-FLAT-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_mov_b32 s4, s2 -; GFX11-FLAT-NEXT: s_mov_b32 s5, s3 -; GFX11-FLAT-NEXT: buffer_load_u16 v0, off, s[4:7], 0 +; GFX11-FLAT-NEXT: s_mov_b32 s0, s6 +; GFX11-FLAT-NEXT: s_mov_b32 s1, s7 +; GFX11-FLAT-NEXT: buffer_load_u16 v0, off, s[0:3], 0 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1] +; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v1, v0, s[4:5] ; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_brev_i16: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[6:7] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1 -; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: global_store_d16_hi_b16 v0, v1, s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -278,61 +278,61 @@ define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s3 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s7 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dword v0, v[0:1] -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dword v0, v[0:1] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: flat_store_dword v[0:1], v2 ; GISEL-NEXT: s_endpgm ; ; GFX11-FLAT-LABEL: v_brev_i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 +; GFX11-FLAT-NEXT: global_load_b32 v0, v0, s[6:7] +; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_brev_i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -362,59 +362,59 @@ define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> ; ; FLAT-LABEL: s_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_mov_b32 s4, s0 -; FLAT-NEXT: s_mov_b32 s5, s1 -; FLAT-NEXT: s_brev_b32 s0, s3 -; FLAT-NEXT: s_brev_b32 s1, s2 -; FLAT-NEXT: v_mov_b32_e32 v0, s1 -; FLAT-NEXT: v_mov_b32_e32 v1, s0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: s_mov_b32 s0, s4 +; FLAT-NEXT: s_mov_b32 s1, s5 +; FLAT-NEXT: s_brev_b32 s4, s7 +; FLAT-NEXT: s_brev_b32 s5, s6 +; FLAT-NEXT: v_mov_b32_e32 v0, s5 +; FLAT-NEXT: v_mov_b32_e32 v1, s4 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b32 s2, s2 -; GISEL-NEXT: s_brev_b32 s3, s3 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: s_brev_b32 s0, s6 +; GISEL-NEXT: s_brev_b32 s1, s7 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v3, s5 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-NEXT: s_endpgm ; ; GFX11-FLAT-LABEL: s_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b32 s2, s2 -; GFX11-FLAT-NEXT: s_brev_b32 s3, s3 +; GFX11-FLAT-NEXT: s_brev_b32 s0, s6 +; GFX11-FLAT-NEXT: s_brev_b32 s1, s7 ; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-FLAT-NEXT: s_mov_b32 s4, s0 -; GFX11-FLAT-NEXT: s_mov_b32 s5, s1 -; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FLAT-NEXT: s_mov_b32 s0, s4 +; GFX11-FLAT-NEXT: s_mov_b32 s1, s5 +; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: s_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b32 s2, s2 -; GFX11-GISEL-NEXT: s_brev_b32 s3, s3 +; GFX11-GISEL-NEXT: s_brev_b32 s0, s6 +; GFX11-GISEL-NEXT: s_brev_b32 s1, s7 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -446,33 +446,33 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i32: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s3 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s7 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_v2i32: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-NEXT: v_mov_b32_e32 v3, s5 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1 @@ -481,31 +481,31 @@ define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX11-FLAT-LABEL: v_brev_v2i32: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 +; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v0, v0 -; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_brev_v2i32: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -534,52 +534,52 @@ define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) # ; ; FLAT-LABEL: s_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; FLAT-NEXT: s_mov_b32 s3, 0xf000 +; FLAT-NEXT: s_mov_b32 s2, -1 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_mov_b32 s4, s0 -; FLAT-NEXT: s_mov_b32 s5, s1 -; FLAT-NEXT: s_brev_b64 s[0:1], s[2:3] -; FLAT-NEXT: v_mov_b32_e32 v0, s0 -; FLAT-NEXT: v_mov_b32_e32 v1, s1 -; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; FLAT-NEXT: s_mov_b32 s0, s4 +; FLAT-NEXT: s_mov_b32 s1, s5 +; FLAT-NEXT: s_brev_b64 s[4:5], s[6:7] +; FLAT-NEXT: v_mov_b32_e32 v0, s4 +; FLAT-NEXT: v_mov_b32_e32 v1, s5 +; FLAT-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: s_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s1 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 -; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: s_brev_b64 s[0:1], s[6:7] +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v2, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v3, s5 ; GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GISEL-NEXT: s_endpgm ; ; GFX11-FLAT-LABEL: s_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[2:3] -; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 -; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-FLAT-NEXT: s_brev_b64 s[0:1], s[6:7] +; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 +; GFX11-FLAT-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: s_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: s_brev_b64 s[2:3], s[2:3] +; GFX11-GISEL-NEXT: s_brev_b64 s[0:1], s[6:7] ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -611,33 +611,33 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; FLAT-LABEL: v_brev_i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s3 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s7 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v2, v0 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 -; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 +; FLAT-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; GISEL-NEXT: v_mov_b32_e32 v4, s1 -; GISEL-NEXT: v_mov_b32_e32 v3, s0 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 +; GISEL-NEXT: v_mov_b32_e32 v4, s5 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1 ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 @@ -646,31 +646,31 @@ define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX11-FLAT-LABEL: v_brev_i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[2:3] -; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 +; GFX11-FLAT-NEXT: global_load_b64 v[0:1], v0, s[6:7] +; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1 -; GFX11-FLAT-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 +; GFX11-FLAT-NEXT: buffer_store_b64 v[1:2], off, s[4:7], 0 ; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_brev_i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-GISEL-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v1, v1 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v2, v0 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b64 v0, v[1:2], s[0:1] +; GFX11-GISEL-NEXT: global_store_b64 v0, v[1:2], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -793,74 +793,74 @@ define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrs ; ; FLAT-LABEL: v_brev_v2i64: ; FLAT: ; %bb.0: -; FLAT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: v_mov_b32_e32 v1, s3 -; FLAT-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; FLAT-NEXT: v_mov_b32_e32 v1, s7 +; FLAT-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; FLAT-NEXT: s_mov_b32 s3, 0xf000 -; FLAT-NEXT: s_mov_b32 s2, -1 +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: s_waitcnt vmcnt(0) ; FLAT-NEXT: v_bfrev_b32_e32 v4, v2 ; FLAT-NEXT: v_bfrev_b32_e32 v3, v3 ; FLAT-NEXT: v_bfrev_b32_e32 v2, v0 ; FLAT-NEXT: v_bfrev_b32_e32 v1, v1 -; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 +; FLAT-NEXT: buffer_store_dwordx4 v[1:4], off, s[4:7], 0 ; FLAT-NEXT: s_endpgm ; ; GISEL-LABEL: v_brev_v2i64: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GISEL-NEXT: v_mov_b32_e32 v1, s7 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v4, v1 ; GISEL-NEXT: v_bfrev_b32_e32 v5, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: v_bfrev_b32_e32 v6, v3 ; GISEL-NEXT: v_bfrev_b32_e32 v7, v2 -; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GISEL-NEXT: s_endpgm ; ; GFX11-FLAT-LABEL: v_brev_v2i64: ; GFX11-FLAT: ; %bb.0: -; GFX11-FLAT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[2:3] -; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-FLAT-NEXT: s_mov_b32 s2, -1 +; GFX11-FLAT-NEXT: global_load_b128 v[0:3], v0, s[6:7] +; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FLAT-NEXT: s_mov_b32 s6, -1 ; GFX11-FLAT-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v4, v2 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v3, v3 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v2, v0 ; GFX11-FLAT-NEXT: v_bfrev_b32_e32 v1, v1 -; GFX11-FLAT-NEXT: buffer_store_b128 v[1:4], off, s[0:3], 0 +; GFX11-FLAT-NEXT: buffer_store_b128 v[1:4], off, s[4:7], 0 ; GFX11-FLAT-NEXT: s_nop 0 ; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLAT-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_brev_v2i64: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-GISEL-NEXT: global_load_b128 v[0:3], v0, s[6:7] ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v4, v1 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v5, v0 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v6, v3 ; GFX11-GISEL-NEXT: v_bfrev_b32_e32 v7, v2 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-GISEL-NEXT: global_store_b128 v0, v[4:7], s[0:1] +; GFX11-GISEL-NEXT: global_store_b128 v0, v[4:7], s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index b26d15e..8d347ae 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -277,53 +277,53 @@ define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, ; ; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s3, s3, 16 -; GFX8-NEXT: s_lshl_b32 s2, s2, 16 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_lshl_b32 s0, s7, 16 +; GFX8-NEXT: s_lshl_b32 s1, s6, 16 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s2, s2, 16 -; GFX10-NEXT: s_lshl_b32 s3, s3, 16 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_lshl_b32 s0, s6, 16 +; GFX10-NEXT: s_lshl_b32 s1, s7, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s2, s2, 16 -; GFX11-NEXT: s_lshl_b32 s3, s3, 16 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_lshl_b32 s0, s6, 16 +; GFX11-NEXT: s_lshl_b32 s1, s7, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: s_lshl_b32 s3, s3, 16 -; GFX940-NEXT: s_lshl_b32 s2, s2, 16 -; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_lshl_b32 s0, s7, 16 +; GFX940-NEXT: s_lshl_b32 s1, s6, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NEXT: s_endpgm entry: %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index 15ebdd7..00af922 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -18,20 +18,20 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) { ; ; VI-LABEL: kernel: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: kernel: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v0, v0, s[2:3] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1245,26 +1245,26 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; VI-LABEL: amd_kernel_v5i8: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s2, s0, 24 -; VI-NEXT: s_lshr_b32 s3, s0, 16 -; VI-NEXT: s_add_i32 s3, s3, s3 -; VI-NEXT: s_add_i32 s2, s2, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_lshr_b32 s0, s2, 24 +; VI-NEXT: s_lshr_b32 s1, s2, 16 +; VI-NEXT: s_add_i32 s1, s1, s1 ; VI-NEXT: s_add_i32 s0, s0, s0 -; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_add_i32 s2, s2, s2 +; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: s_and_b32 s1, s1, 0xff +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_and_b32 s3, s3, 0xff ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; VI-NEXT: s_add_i32 s1, s1, s1 +; VI-NEXT: s_add_i32 s3, s3, s3 ; VI-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: v_mov_b32_e32 v0, 4 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_store_byte v[0:1], v5 @@ -1273,16 +1273,16 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) { ; ; GFX11-LABEL: amd_kernel_v5i8: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0 -; GFX11-NEXT: s_lshr_b32 s2, s0, 16 -; GFX11-NEXT: s_lshr_b32 s3, s0, 24 -; GFX11-NEXT: v_add_nc_u16 v1, s0, s0 -; GFX11-NEXT: v_add_nc_u16 v2, s3, s3 +; GFX11-NEXT: v_lshrrev_b16 v0, 8, s2 +; GFX11-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11-NEXT: s_lshr_b32 s1, s2, 24 +; GFX11-NEXT: v_add_nc_u16 v1, s2, s2 +; GFX11-NEXT: v_add_nc_u16 v2, s1, s1 ; GFX11-NEXT: v_add_nc_u16 v0, v0, v0 -; GFX11-NEXT: v_add_nc_u16 v3, s2, s2 -; GFX11-NEXT: v_add_nc_u16 v6, s1, s1 +; GFX11-NEXT: v_add_nc_u16 v3, s0, s0 +; GFX11-NEXT: v_add_nc_u16 v6, s3, s3 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0 diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll index b6948da..9336816 100644 --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -30,16 +30,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dword v2, v[0:1] ; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:8 ; GFX9-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GFX9-NEXT: flat_load_dword v5, v[0:1] offset:24 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: flat_store_dword v[0:1], v3 offset:8 @@ -49,20 +49,20 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX10-LABEL: cluster_load_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 +; GFX10-NEXT: s_add_u32 s0, s4, 8 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: s_add_u32 s2, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_add_u32 s0, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_clause 0x3 @@ -70,16 +70,16 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; GFX10-NEXT: flat_load_dword v9, v[2:3] ; GFX10-NEXT: flat_load_dword v10, v[4:5] ; GFX10-NEXT: flat_load_dword v11, v[6:7] -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_add_u32 s0, s6, 8 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 16 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: s_add_u32 s2, s2, 24 -; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_add_u32 s0, s6, 16 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: s_add_u32 s2, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: s_addc_u32 s3, s7, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 ; GFX10-NEXT: v_mov_b32_e32 v7, s3 @@ -96,15 +96,15 @@ define amdgpu_kernel void @cluster_load_cluster_store(ptr noalias %lb, ptr noali ; ; GFX11-LABEL: cluster_load_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:8 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) @@ -155,16 +155,16 @@ bb: define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr noalias %sb) { ; GFX9-LABEL: cluster_load_valu_cluster_store: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: flat_load_dword v2, v[0:1] ; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:8 ; GFX9-NEXT: flat_load_dword v4, v[0:1] offset:16 ; GFX9-NEXT: flat_load_dword v5, v[0:1] offset:24 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 @@ -175,20 +175,20 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX10-LABEL: cluster_load_valu_cluster_store: ; GFX10: ; %bb.0: ; %bb -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_add_u32 s4, s0, 8 -; GFX10-NEXT: s_addc_u32 s5, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s4 -; GFX10-NEXT: s_add_u32 s6, s0, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: s_addc_u32 s7, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_add_u32 s0, s0, 24 -; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v4, s6 -; GFX10-NEXT: v_mov_b32_e32 v5, s7 +; GFX10-NEXT: s_add_u32 s0, s4, 8 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: s_add_u32 s2, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_addc_u32 s3, s5, 0 +; GFX10-NEXT: s_add_u32 s0, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: flat_load_dword v6, v[2:3] ; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 @@ -196,18 +196,18 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; GFX10-NEXT: flat_load_dword v8, v[0:1] ; GFX10-NEXT: flat_load_dword v9, v[4:5] ; GFX10-NEXT: flat_load_dword v10, v[2:3] -; GFX10-NEXT: s_add_u32 s0, s2, 8 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: s_add_u32 s4, s2, 16 +; GFX10-NEXT: s_add_u32 s0, s6, 8 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: s_add_u32 s2, s6, 16 ; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: s_addc_u32 s5, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: s_addc_u32 s3, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 +; GFX10-NEXT: s_add_u32 s0, s6, 24 +; GFX10-NEXT: v_mov_b32_e32 v1, s7 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: s_addc_u32 s1, s7, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 ; GFX10-NEXT: v_mov_b32_e32 v7, s1 @@ -223,15 +223,15 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(ptr noalias %lb, ptr ; ; GFX11-LABEL: cluster_load_valu_cluster_store: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:8 ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24 -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 33c0d90..9c7fa15 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -665,17 +665,17 @@ define amdgpu_kernel void @sub_zext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_zext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -714,17 +714,17 @@ define amdgpu_kernel void @sub_sext_setcc_commute(ptr addrspace(1) nocapture %ar ; ; GFX9-LABEL: sub_sext_setcc_commute: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: global_load_dword v3, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 -; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v0, s6, v0 +; GFX9-NEXT: v_subrev_u32_e32 v0, s7, v0 +; GFX9-NEXT: global_store_dword v2, v0, s[4:5] ; GFX9-NEXT: s_endpgm bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 4decf39..332b601 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -127,19 +127,19 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32: @@ -164,41 +164,41 @@ define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -235,21 +235,21 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v1, 32, v1 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_v2i32: @@ -277,48 +277,48 @@ define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v2i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -359,15 +359,15 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctlz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v3, v3 ; VI-NEXT: v_ffbh_u32_e32 v2, v2 @@ -377,7 +377,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_min_u32_e32 v1, 32, v1 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_v4i32: @@ -411,11 +411,11 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_ctlz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v3, v3 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 @@ -425,16 +425,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -444,16 +444,16 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_v4i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3] +; GFX11-NEXT: global_load_b128 v[0:3], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v3, v3 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 @@ -465,7 +465,7 @@ define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -500,22 +500,22 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_ctlz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_subrev_u32_e32 v0, vcc, 24, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i8: @@ -550,42 +550,42 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_ctlz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 -; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b8 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -706,16 +706,16 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_flbit_i32_b64 s0, s[2:3] -; VI-NEXT: s_min_u32 s0, s0, 64 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_flbit_i32_b64 s4, s[6:7] +; VI-NEXT: s_min_u32 s4, s4, 64 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_ctlz_i64_trunc: @@ -737,35 +737,35 @@ define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX10-NEXT: s_min_u32 s2, s2, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX10-NEXT: s_min_u32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: s_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3] +; GFX11-NEXT: s_clz_i32_u64 s0, s[6:7] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_min_u32 s2, s2, 64 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_min_u32 s0, s0, 64 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -799,16 +799,16 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_ctlz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 @@ -847,25 +847,25 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 @@ -873,15 +873,15 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] +; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 @@ -889,7 +889,7 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX11-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX11-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -926,16 +926,16 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_ctlz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v1 @@ -974,49 +974,49 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_ctlz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i64_trunc: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3] +; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX11-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1052,18 +1052,18 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_eq_neg1: @@ -1090,40 +1090,40 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1159,18 +1159,18 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_ne_neg1: @@ -1197,40 +1197,40 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1270,21 +1270,21 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth: @@ -1313,47 +1313,47 @@ define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1392,21 +1392,21 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth: @@ -1435,47 +1435,47 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1510,17 +1510,17 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i8_sel_eq_neg1: @@ -1552,22 +1552,22 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1578,18 +1578,18 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 -; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1624,25 +1624,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_ctlz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_add_u32_e32 v2, vcc, -16, v2 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i16_sel_eq_neg1: @@ -1674,25 +1674,25 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 @@ -1700,15 +1700,15 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 @@ -1717,7 +1717,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX11-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1752,18 +1752,18 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctlz_i7_sel_eq_neg1: @@ -1795,23 +1795,23 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1824,19 +1824,19 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_clz_i32_u32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x7f, v0 -; GFX11-NEXT: global_store_b8 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b8 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 756b819..97529b5 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -99,17 +99,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -134,14 +134,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -174,15 +174,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 @@ -211,15 +211,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -254,15 +254,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctlz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v3, v3 ; VI-NEXT: v_ffbh_u32_e32 v2, v2 @@ -295,17 +295,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -534,13 +534,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_flbit_i32_b64 s0, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -562,14 +562,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s4, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone %ctlz_ret = icmp ne i64 %val, 0 @@ -602,18 +602,18 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 ; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -650,17 +650,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone @@ -698,14 +698,14 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 1 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s0, s6, 1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) @@ -716,8 +716,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -754,11 +754,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 @@ -766,7 +766,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone @@ -810,22 +810,22 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: s_add_u32 s0, s6, 3 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s6, 2 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s6, 1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -841,8 +841,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v2, 32, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -870,13 +870,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -887,7 +887,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone @@ -947,43 +947,43 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 5 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_add_u32 s4, s2, 4 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 7 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_add_u32 s4, s2, 6 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_add_u32 s4, s2, 3 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v11, s5 -; VI-NEXT: v_mov_b32_e32 v10, s4 -; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: s_add_u32 s0, s6, 5 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s0, s6, 4 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s6, 7 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s6, 6 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: s_add_u32 s0, s6, 3 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: v_mov_b32_e32 v8, s0 +; VI-NEXT: s_add_u32 s0, s6, 2 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v11, s1 +; VI-NEXT: v_mov_b32_e32 v10, s0 +; VI-NEXT: s_add_u32 s0, s6, 1 ; VI-NEXT: flat_load_ubyte v12, v[0:1] ; VI-NEXT: flat_load_ubyte v13, v[2:3] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v5, v[6:7] -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: flat_load_ubyte v6, v[8:9] -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_load_ubyte v7, v[10:11] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] @@ -1010,9 +1010,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_min_u32_e32 v0, v0, v3 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_min_u32_e32 v0, 64, v0 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1051,17 +1051,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1082,7 +1082,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v4, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -1114,17 +1114,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: v_ctlz_zero_undef_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1159,11 +1159,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1171,7 +1171,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1257,12 +1257,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: s_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_flbit_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_flbit_i32_b64 s0, s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1283,12 +1283,12 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_flbit_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 @@ -1319,16 +1319,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_ctlz_zero_undef_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 @@ -1365,17 +1365,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0 ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1409,16 +1409,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; VI-LABEL: v_ctlz_zero_undef_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v1 @@ -1455,17 +1455,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v2, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -1499,17 +1499,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1535,16 +1535,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1578,17 +1578,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1614,16 +1614,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1656,16 +1656,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1698,11 +1698,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 ; GFX9-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1710,9 +1710,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3] -; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[0:1], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1751,15 +1751,15 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v3, v2 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 @@ -1800,17 +1800,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspa ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, vcc ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) @@ -1851,19 +1851,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1889,16 +1889,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1935,19 +1935,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1973,16 +1973,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noali ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2019,19 +2019,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2058,16 +2058,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -2104,19 +2104,19 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -2143,16 +2143,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1 ; ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll index b6359f1..4f2bde8 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -91,18 +91,18 @@ define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16: @@ -344,15 +344,15 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -360,7 +360,7 @@ define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addr ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_v2i16: @@ -430,15 +430,15 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0 @@ -452,7 +452,7 @@ define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addr ; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; VI-NEXT: v_or_b32_e32 v1, v1, v2 ; VI-NEXT: v_or_b32_e32 v0, v0, v3 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_v4i16: @@ -562,15 +562,17 @@ define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v8i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2 @@ -769,18 +771,20 @@ define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: v_ctpop_v16i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 ; VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2 @@ -1035,18 +1039,18 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noal ; ; VI-LABEL: v_ctpop_i16_add_inline_constant: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_inline_constant: @@ -1107,18 +1111,18 @@ define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) ; ; VI-LABEL: v_ctpop_i16_add_inline_constant_inv: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 4 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_inline_constant_inv: @@ -1180,19 +1184,19 @@ define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_ctpop_i16_add_literal: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; VI-NEXT: s_movk_i32 s4, 0x3e7 +; VI-NEXT: s_movk_i32 s0, 0x3e7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_bcnt_u32_b32 v0, v0, s4 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: v_bcnt_u32_b32 v0, v0, s0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_ctpop_i16_add_literal: diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index 131ce14..633f120 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -66,19 +66,19 @@ define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrsp ; ; VI-LABEL: v_ctpop_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -242,21 +242,21 @@ define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0 ; VI-NEXT: v_bcnt_u32_b32 v1, v3, v2 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid @@ -298,18 +298,18 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr ; ; VI-LABEL: v_ctpop_v4i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 5, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[4:5] ; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v4 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_bcnt_u32_b32 v8, v2, 0 @@ -320,7 +320,7 @@ define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addr ; VI-NEXT: v_bcnt_u32_b32 v6, v6, 0 ; VI-NEXT: v_bcnt_u32_b32 v4, v5, v4 ; VI-NEXT: v_bcnt_u32_b32 v5, v7, v6 -; VI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[2:5], off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid @@ -507,22 +507,22 @@ define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_ctpop_i128: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_bcnt_u32_b32 v2, v2, 0 ; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0 ; VI-NEXT: v_bcnt_u32_b32 v2, v3, v2 ; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll index ee2894a..483402d 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz.ll @@ -111,19 +111,19 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32: @@ -148,28 +148,28 @@ define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -204,21 +204,21 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v1 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v1, 32, v1 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_v2i32: @@ -246,32 +246,32 @@ define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v2i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v2i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -310,15 +310,15 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; VI-LABEL: v_cttz_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v3, v3 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 @@ -328,7 +328,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; VI-NEXT: v_min_u32_e32 v2, 32, v2 ; VI-NEXT: v_min_u32_e32 v1, 32, v1 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_v4i32: @@ -362,11 +362,11 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; ; GFX10-LABEL: v_cttz_v4i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 @@ -376,16 +376,16 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_v4i32: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 @@ -395,7 +395,7 @@ define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrs ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v3, 32, v3 -; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -427,21 +427,21 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; VI-LABEL: v_cttz_i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v0, 0x100, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i8: @@ -475,26 +475,26 @@ define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; ; GFX10-LABEL: v_cttz_i8: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %valptr %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -598,16 +598,16 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; VI-LABEL: s_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_ff1_i32_b64 s0, s[2:3] -; VI-NEXT: s_min_u32 s0, s0, 64 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_ff1_i32_b64 s4, s[6:7] +; VI-NEXT: s_min_u32 s4, s4, 64 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: s_cttz_i64_trunc: @@ -629,24 +629,24 @@ define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 % ; ; GFX10-LABEL: s_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_ff1_i32_b64 s2, s[2:3] -; GFX10-NEXT: s_min_u32 s2, s2, 64 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX10-NEXT: s_min_u32 s0, s0, 64 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: s_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: s_ff1_i32_b64 s2, s[2:3] -; GFX10-GISEL-NEXT: s_min_u32 s2, s2, 64 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX10-GISEL-NEXT: s_min_u32 s0, s0, 64 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false) %trunc = trunc i64 %cttz to i32 @@ -678,16 +678,16 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; VI-LABEL: v_cttz_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v3 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v3 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3 +; VI-NEXT: v_mov_b32_e32 v4, s5 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v3 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v1 @@ -726,25 +726,25 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; ; GFX10-LABEL: v_cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp ; GFX10-NEXT: v_min3_u32 v0, v0, v1, 64 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 @@ -752,7 +752,7 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -787,16 +787,16 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; VI-LABEL: v_cttz_i64_trunc: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2] -; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0 -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0 +; VI-NEXT: v_mov_b32_e32 v4, s5 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v2 @@ -835,33 +835,33 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a ; ; GFX10-LABEL: v_cttz_i64_trunc: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min3_u32 v1, v1, v2, 64 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i64_trunc: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 3, v0 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[2:3] +; GFX10-GISEL-NEXT: global_load_dwordx2 v[1:2], v1, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid @@ -895,18 +895,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32_sel_eq_neg1: @@ -933,29 +933,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -989,18 +989,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32_sel_ne_neg1: @@ -1027,29 +1027,29 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v1, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1087,21 +1087,21 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_eq_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32_sel_eq_bitwidth: @@ -1130,32 +1130,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1192,21 +1192,21 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i32_sel_ne_bitwidth: @@ -1235,32 +1235,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 32, v0 ; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 32, v0 ; GFX10-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -1293,17 +1293,17 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i8_sel_eq_neg1: @@ -1335,32 +1335,32 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v1, 0x100, v0 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-GISEL-NEXT: v_cmp_eq_u32_sdwa s0, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s2 -; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[0:1] +; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, -1, s0 +; GFX10-GISEL-NEXT: global_store_byte v2, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid @@ -1393,24 +1393,24 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s10, s6 -; VI-NEXT: s_mov_b32 s11, s7 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s10, s2 +; VI-NEXT: s_mov_b32 s11, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s8, s2 -; VI-NEXT: s_mov_b32 s9, s3 +; VI-NEXT: s_mov_b32 s8, s6 +; VI-NEXT: s_mov_b32 s9, s7 ; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 ; VI-NEXT: v_mov_b32_e32 v1, 0xffff -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v2, 0x10000, v0 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i16_sel_eq_neg1: @@ -1442,31 +1442,31 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX10-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v1 ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo -; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[6:7] ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo -; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %valptr %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone @@ -1499,18 +1499,18 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i7_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: v_cttz_i7_sel_eq_neg1: @@ -1542,23 +1542,23 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX10-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX10-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s3 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s7 ; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v1, v0 ; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo ; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off @@ -1570,7 +1570,7 @@ define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 -; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] +; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index 392a443..a6cbfa5 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -86,17 +86,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_zero_undef_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -121,14 +121,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid @@ -161,15 +161,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v1 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 @@ -198,15 +198,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -241,15 +241,15 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; VI-LABEL: v_cttz_zero_undef_v4i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v3, v3 ; VI-NEXT: v_ffbl_b32_e32 v2, v2 @@ -282,17 +282,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 -; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid @@ -510,13 +510,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: s_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ff1_i32_b64 s2, s[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_ff1_i32_b64 s0, s[6:7] +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -538,14 +538,14 @@ define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-GISEL-NEXT: s_mov_b32 s5, 0 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_mov_b32 s1, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_ff1_i32_b64 s4, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: s_ff1_i32_b64 s0, s[6:7] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone %cttz_ret = icmp ne i64 %val, 0 @@ -577,17 +577,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; VI-LABEL: v_cttz_zero_undef_i8_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -622,16 +622,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone @@ -668,14 +668,14 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i16_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 1 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s0, s6, 1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(1) @@ -685,8 +685,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; VI-NEXT: v_ffbl_b32_e32 v1, v0 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -721,18 +721,18 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone @@ -776,22 +776,22 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i32_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: s_add_u32 s0, s6, 3 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s6, 2 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s6, 1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -807,8 +807,8 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v2, 32, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -836,13 +836,13 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -853,7 +853,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone @@ -913,43 +913,43 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; VI-LABEL: v_cttz_zero_undef_i64_with_select: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 5 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: s_add_u32 s4, s2, 4 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 7 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: s_add_u32 s4, s2, 6 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v7, s5 -; VI-NEXT: v_mov_b32_e32 v6, s4 -; VI-NEXT: s_add_u32 s4, s2, 3 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v9, s5 -; VI-NEXT: v_mov_b32_e32 v8, s4 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v11, s5 -; VI-NEXT: v_mov_b32_e32 v10, s4 +; VI-NEXT: s_add_u32 s0, s6, 5 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_add_u32 s0, s6, 4 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s6, 7 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s6, 6 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: s_add_u32 s0, s6, 3 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v9, s1 +; VI-NEXT: v_mov_b32_e32 v8, s0 +; VI-NEXT: s_add_u32 s0, s6, 2 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v11, s1 +; VI-NEXT: v_mov_b32_e32 v10, s0 ; VI-NEXT: flat_load_ubyte v12, v[0:1] ; VI-NEXT: flat_load_ubyte v13, v[2:3] ; VI-NEXT: flat_load_ubyte v4, v[4:5] ; VI-NEXT: flat_load_ubyte v5, v[6:7] -; VI-NEXT: s_add_u32 s4, s2, 1 +; VI-NEXT: s_add_u32 s0, s6, 1 ; VI-NEXT: flat_load_ubyte v6, v[8:9] -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_load_ubyte v7, v[10:11] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] @@ -976,9 +976,9 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; VI-NEXT: v_or_b32_e32 v0, v4, v0 ; VI-NEXT: v_ffbl_b32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, v3, v0 -; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_min_u32_e32 v0, 64, v0 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -1017,17 +1017,17 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; ; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4 -; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5 -; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:6 -; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:7 +; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[6:7] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[6:7] offset:4 +; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[6:7] offset:5 +; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[6:7] offset:6 +; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[6:7] offset:7 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5) @@ -1048,7 +1048,7 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc -; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -1091,22 +1091,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: s_add_u32 s0, s6, 3 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s6, 2 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s6, 1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -1121,8 +1121,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1152,13 +1152,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1170,7 +1170,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1213,22 +1213,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; VI-LABEL: v_cttz_i32_sel_ne_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: s_add_u32 s0, s6, 3 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s6, 2 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s6, 1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -1243,8 +1243,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; VI-NEXT: v_or_b32_e32 v0, v2, v0 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_ffbl_b32_e32 v2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1274,13 +1274,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1292,7 +1292,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1338,22 +1338,22 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i32_sel_ne_bitwidth: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 3 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: s_add_u32 s4, s2, 2 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_add_u32 s2, s2, 1 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 -; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: s_add_u32 s0, s6, 3 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_add_u32 s0, s6, 2 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: s_add_u32 s0, s6, 1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[6:7] @@ -1371,8 +1371,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_min_u32_e32 v0, 32, v0 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 32, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1404,13 +1404,13 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 -; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1) @@ -1422,7 +1422,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone @@ -1453,10 +1453,10 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i8_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, 0xff ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1464,8 +1464,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1498,18 +1498,18 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1 ; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3 ; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i8, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone @@ -1544,14 +1544,14 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; VI-LABEL: v_cttz_i16_sel_eq_neg1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s2, 1 -; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_add_u32 s0, s6, 1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: v_mov_b32_e32 v1, 0xffff @@ -1563,8 +1563,8 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; VI-NEXT: v_ffbl_b32_e32 v2, v2 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -1597,12 +1597,12 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; ; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] -; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 +; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[6:7] +; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[6:7] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 ; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1 @@ -1610,7 +1610,7 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc -; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-GISEL-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %arrayidx, align 1 %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 3f513e1..fd4e182 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -918,50 +918,50 @@ define amdgpu_kernel void @load_i8_to_f32(ptr addrspace(1) noalias %out, ptr add ; ; VI-LABEL: load_i8_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_i8_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_i8_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_i8_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -996,58 +996,58 @@ define amdgpu_kernel void @load_v2i8_to_v2f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v2i8_to_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v2i8_to_v2f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v2i8_to_v2f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v0, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v2i8_to_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u16 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1084,61 +1084,61 @@ define amdgpu_kernel void @load_v3i8_to_v3f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v3i8_to_v3f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v3i8_to_v3f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v3i8_to_v3f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1] +; GFX9-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v3i8_to_v3f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX11-NEXT: global_store_b96 v3, v[0:2], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1175,66 +1175,66 @@ define amdgpu_kernel void @load_v4i8_to_v4f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v4i8_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0 ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1281,11 +1281,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1297,8 +1297,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v1, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -1307,20 +1307,20 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -1329,19 +1329,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -1350,19 +1350,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 -; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 -; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1 -; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:2 +; GFX11-NEXT: global_load_u8 v4, v0, s[6:7] offset:1 +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(2) @@ -1371,7 +1371,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v5, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1426,15 +1426,16 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; ; VI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s8, 0x4000405 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v0 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_add_u32_e32 v4, vcc, s6, v0 +; VI-NEXT: v_mov_b32_e32 v1, s11 +; VI-NEXT: v_add_u32_e32 v4, vcc, s10, v0 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 3, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc @@ -1448,12 +1449,11 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] ; VI-NEXT: flat_load_ubyte v4, v[0:1] -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 +; VI-NEXT: s_mov_b32 s8, 0x4000405 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_lshlrev_b32_e32 v5, 8, v6 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v6 @@ -1467,21 +1467,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; VI-NEXT: v_or_b32_e32 v5, v7, v3 ; VI-NEXT: v_mov_b32_e32 v3, v1 ; VI-NEXT: v_perm_b32 v4, v4, v5, s8 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -; VI-NEXT: buffer_store_dword v4, off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v4, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 -; GFX10-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 -; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v1, v0, s[8:9] offset:2 +; GFX10-NEXT: global_load_ubyte v3, v0, s[8:9] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[10:11] offset:3 +; GFX10-NEXT: global_load_ubyte v4, v0, s[10:11] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(2) ; GFX10-NEXT: v_lshl_or_b32 v5, v3, 8, v1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 @@ -1491,21 +1491,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_perm_b32 v4, v5, v6, 0x4000405 -; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[0:1] -; GFX10-NEXT: global_store_dword v7, v4, s[2:3] +; GFX10-NEXT: global_store_dwordx4 v7, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dword v7, v4, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: s_mov_b32 s0, 0x4000405 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:2 -; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:3 -; GFX9-NEXT: global_load_ubyte v3, v0, s[4:5] offset:3 -; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:2 -; GFX9-NEXT: s_mov_b32 s4, 0x4000405 +; GFX9-NEXT: global_load_ubyte v1, v0, s[8:9] offset:2 +; GFX9-NEXT: global_load_ubyte v2, v0, s[10:11] offset:3 +; GFX9-NEXT: global_load_ubyte v3, v0, s[8:9] offset:3 +; GFX9-NEXT: global_load_ubyte v4, v0, s[10:11] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshl_or_b32 v6, v3, 8, v1 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 @@ -1514,22 +1514,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_perm_b32 v4, v6, v7, s4 -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dword v5, v4, s[2:3] +; GFX9-NEXT: v_perm_b32 v4, v6, v7, s0 +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dword v5, v4, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_unaligned_multiuse: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_load_u8 v1, v0, s[4:5] offset:2 -; GFX11-NEXT: global_load_u8 v3, v0, s[4:5] offset:3 -; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:3 -; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] offset:2 +; GFX11-NEXT: global_load_u8 v1, v0, s[8:9] offset:2 +; GFX11-NEXT: global_load_u8 v3, v0, s[8:9] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[10:11] offset:3 +; GFX11-NEXT: global_load_u8 v0, v0, s[10:11] offset:2 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 8, v1 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 @@ -1541,8 +1541,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_perm_b32 v4, v4, v5, 0x4000405 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] -; GFX11-NEXT: global_store_b32 v6, v4, s[2:3] +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b32 v6, v4, s[6:7] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1605,23 +1605,22 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-LABEL: load_v4i8_to_v4f32_2_uses: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: v_mov_b32_e32 v5, 0xffffff00 +; VI-NEXT: v_mov_b32_e32 v6, 9 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 ; VI-NEXT: v_mov_b32_e32 v7, 0x900 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s2 -; VI-NEXT: s_mov_b32 s5, s3 -; VI-NEXT: s_mov_b32 s2, s6 -; VI-NEXT: s_mov_b32 s3, s7 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -1631,24 +1630,24 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; VI-NEXT: v_add_u16_e32 v9, 9, v4 ; VI-NEXT: v_and_b32_sdwa v5, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_nop 0 ; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_or_b32_sdwa v1, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; VI-NEXT: v_add_u16_e32 v0, 0x900, v0 ; VI-NEXT: v_add_u16_sdwa v1, v1, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX10: ; %bb.0: +; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] -; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0 @@ -1666,22 +1665,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX10-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] -; GFX10-NEXT: global_store_dword v4, v5, s[2:3] +; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; GFX10-NEXT: global_store_dword v4, v5, s[6:7] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_movk_i32 s4, 0xff00 ; GFX9-NEXT: v_mov_b32_e32 v6, 9 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v4, v0, s[0:1] -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 -; GFX9-NEXT: s_movk_i32 s5, 0x900 +; GFX9-NEXT: s_movk_i32 s0, 0xff00 +; GFX9-NEXT: s_movk_i32 s1, 0x900 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 @@ -1689,26 +1687,26 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4 ; GFX9-NEXT: v_add_u16_e32 v8, 9, v4 -; GFX9-NEXT: v_and_b32_sdwa v9, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_sdwa v9, v4, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_or_b32_sdwa v1, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0 -; GFX9-NEXT: v_add_u16_sdwa v1, v1, s5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_sdwa v1, v1, s1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v5, v0, s[2:3] +; GFX9-NEXT: global_store_dword v5, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11: ; %bb.0: +; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-NEXT: v_add_nc_u16 v2, v0, 9 @@ -1734,10 +1732,9 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] -; GFX11-NEXT: global_store_b32 v4, v5, s[2:3] +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] +; GFX11-NEXT: global_store_b32 v4, v5, s[6:7] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1794,11 +1791,11 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v7i8_to_v7f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 5, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1819,8 +1816,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v9, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(6) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v5, v10 ; VI-NEXT: s_waitcnt vmcnt(4) @@ -1833,23 +1830,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_cvt_f32_ubyte0_e32 v6, v8 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v9 -; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx3 v[4:6], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v7i8_to_v7f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x5 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_short_d16 v7, v0, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_short_d16 v7, v0, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(5) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX10-NEXT: s_waitcnt vmcnt(4) @@ -1863,22 +1860,22 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v4, v7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx3 v8, v[4:6], s[4:5] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v7i8_to_v7f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:6 -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v7, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v8, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v9, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:6 +; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v7, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v8, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v9, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(5) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v6, v1 ; GFX9-NEXT: s_waitcnt vmcnt(4) @@ -1892,23 +1889,23 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v9 -; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] -; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[0:1] offset:16 +; GFX9-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] +; GFX9-NEXT: global_store_dwordx3 v10, v[4:6], s[4:5] offset:16 ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v7i8_to_v7f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:6 -; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 -; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 -; GFX11-NEXT: global_load_u8 v5, v0, s[2:3] offset:1 -; GFX11-NEXT: global_load_d16_b16 v7, v0, s[2:3] offset:4 -; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v4, v0, s[6:7] offset:6 +; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:2 +; GFX11-NEXT: global_load_u8 v5, v0, s[6:7] offset:1 +; GFX11-NEXT: global_load_d16_b16 v7, v0, s[6:7] offset:4 +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, v4 ; GFX11-NEXT: s_waitcnt vmcnt(4) @@ -1923,8 +1920,8 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b96 v8, v[4:6], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b96 v8, v[4:6], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1966,15 +1963,15 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: load_v8i8_to_v8f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dwordx2 v[7:8], v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -1984,17 +1981,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; VI-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; VI-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: load_v8i8_to_v8f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX10-NEXT: v_mov_b32_e32 v10, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] +; GFX10-NEXT: global_load_dwordx2 v[8:9], v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 @@ -2004,17 +2001,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v2, v8 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 -; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] offset:16 -; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[4:5] offset:16 +; GFX10-NEXT: global_store_dwordx4 v10, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: load_v8i8_to_v8f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[7:8], v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v7 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v7 @@ -2024,17 +2021,17 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v6, v8 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v5, v8 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v4, v8 -; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[0:1] offset:16 -; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v9, v[4:7], s[4:5] offset:16 +; GFX9-NEXT: global_store_dwordx4 v9, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: load_v8i8_to_v8f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX11-NEXT: v_mov_b32_e32 v10, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[8:9], v0, s[2:3] +; GFX11-NEXT: global_load_b64 v[8:9], v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v7, v9 ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v6, v9 @@ -2045,8 +2042,8 @@ define amdgpu_kernel void @load_v8i8_to_v8f32(ptr addrspace(1) noalias %out, ptr ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v8 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v8 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v10, v[4:7], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v10, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v10, v[4:7], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v10, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2081,58 +2078,58 @@ define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: i8_zext_inreg_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_u32_e32 v0, 2, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2168,53 +2165,53 @@ define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: i8_zext_inreg_hi1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_inreg_hi1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2251,50 +2248,50 @@ define amdgpu_kernel void @i8_zext_i32_to_f32(ptr addrspace(1) noalias %out, ptr ; ; VI-LABEL: i8_zext_i32_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: i8_zext_i32_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: i8_zext_i32_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: i8_zext_i32_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2338,11 +2335,11 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; ; VI-LABEL: v4i8_zext_v4i32_to_v4f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -2354,8 +2351,8 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v1, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(3) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; VI-NEXT: s_waitcnt vmcnt(2) @@ -2364,20 +2361,20 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(3) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX10-NEXT: s_waitcnt vmcnt(2) @@ -2386,19 +2383,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX10-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] offset:2 -; GFX9-NEXT: global_load_ubyte v4, v0, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] offset:2 +; GFX9-NEXT: global_load_ubyte v4, v0, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(3) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX9-NEXT: s_waitcnt vmcnt(2) @@ -2407,19 +2404,19 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v5 -; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] +; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: v4i8_zext_v4i32_to_v4f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] offset:3 -; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] offset:2 -; GFX11-NEXT: global_load_u8 v4, v0, s[2:3] offset:1 -; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] +; GFX11-NEXT: global_load_u8 v1, v0, s[6:7] offset:3 +; GFX11-NEXT: global_load_u8 v2, v0, s[6:7] offset:2 +; GFX11-NEXT: global_load_u8 v4, v0, s[6:7] offset:1 +; GFX11-NEXT: global_load_u8 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(3) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 ; GFX11-NEXT: s_waitcnt vmcnt(2) @@ -2428,7 +2425,7 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(ptr addrspace(1) noalias %ou ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b128 v5, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v5, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2463,53 +2460,53 @@ define amdgpu_kernel void @extract_byte0_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte0_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: extract_byte0_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte0_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte0_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2544,53 +2541,53 @@ define amdgpu_kernel void @extract_byte1_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte1_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: extract_byte1_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte1_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte1_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2626,53 +2623,53 @@ define amdgpu_kernel void @extract_byte2_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte2_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: extract_byte2_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte2_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte2_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2708,53 +2705,53 @@ define amdgpu_kernel void @extract_byte3_to_f32(ptr addrspace(1) noalias %out, p ; ; VI-LABEL: extract_byte3_to_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: extract_byte3_to_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v0, v0, s[2:3] +; GFX10-NEXT: global_load_dword v0, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX9-LABEL: extract_byte3_to_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[2:3] +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: extract_byte3_to_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v0, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2825,16 +2822,16 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr ; ; GFX9-LABEL: cvt_ubyte0_or_multiuse: ; GFX9: ; %bb.0: ; %bb -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v0, s[0:1] +; GFX9-NEXT: global_load_dword v0, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v0, 0x80000001, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; GFX9-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: global_store_dword v1, v0, s[6:7] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: cvt_ubyte0_or_multiuse: diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 8f31bb1..67b0cef 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -376,32 +376,32 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_LH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_lh_b32_b16 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_pack_lh_b32_b16 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_LH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_lh_b32_b16 s2, s2, s3 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_pack_lh_b32_b16 s0, s6, s7 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[4:5] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_LH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_lh_b32_b16 s2, s2, s3 +; GFX11-NEXT: s_pack_lh_b32_b16 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -466,32 +466,32 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32 ; ; GFX9-LABEL: uniform_vec_i16_HH: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_pack_hh_b32_b16 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_pack_hh_b32_b16 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX906-LABEL: uniform_vec_i16_HH: ; GFX906: ; %bb.0: -; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX906-NEXT: v_mov_b32_e32 v0, 0 ; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: s_pack_hh_b32_b16 s2, s2, s3 -; GFX906-NEXT: v_mov_b32_e32 v1, s2 -; GFX906-NEXT: global_store_dword v0, v1, s[0:1] +; GFX906-NEXT: s_pack_hh_b32_b16 s0, s6, s7 +; GFX906-NEXT: v_mov_b32_e32 v1, s0 +; GFX906-NEXT: global_store_dword v0, v1, s[4:5] ; GFX906-NEXT: s_endpgm ; ; GFX11-LABEL: uniform_vec_i16_HH: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_pack_hh_b32_b16 s2, s2, s3 +; GFX11-NEXT: s_pack_hh_b32_b16 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 777a8f3..4cef701 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -319,16 +319,16 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32 ; GFX9-NEXT: ds_read_b32 v2, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 @@ -370,16 +370,16 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) % ; ; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 ; GFX9-NEXT: ds_read_b32 v1, v1 offset:32 ; GFX9-NEXT: ds_read_b32 v2, v2 offset:32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 70011e5..44d65c9 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -208,30 +208,30 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x ; ; VI-LABEL: extract_vector_elt_v3f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: extract_vector_elt_v3f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s6 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:2 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -278,16 +278,16 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou ; GFX11-LABEL: dynamic_extract_vector_elt_v3f16: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s4, s4, 4 +; GFX11-NEXT: s_lshl_b32 s0, s2, 4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], s0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index f34824c..c35e1e2 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -212,14 +212,14 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; ; GFX11-LABEL: s_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff7fff -; GFX11-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s0, s6, 0x7fff7fff +; GFX11-NEXT: s_and_b32 s1, s7, 0x7fff7fff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 07581ad..8f0d639 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -113,14 +113,14 @@ define amdgpu_kernel void @fabs_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; VI-LABEL: fabs_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset0_b32 s3, 31 -; VI-NEXT: s_bitset0_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_and_b32 s0, s7, 0x7fffffff +; VI-NEXT: s_and_b32 s1, s6, 0x7fffffff +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) @@ -185,12 +185,12 @@ define amdgpu_kernel void @fabsf_fn_fold(ptr addrspace(1) %out, float %in0, floa ; ; VI-LABEL: fabsf_fn_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mul_f32_e64 v2, |s6|, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @fabsf(float %in0) @@ -215,12 +215,12 @@ define amdgpu_kernel void @fabs_fold(ptr addrspace(1) %out, float %in0, float %i ; ; VI-LABEL: fabs_fold: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_mul_f32_e64 v2, |s2|, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_mul_f32_e64 v2, |s6|, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %in0) diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index d53c041..bdd08aa6 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -123,11 +123,11 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl ; ; GFX12-LABEL: s_test_canonicalize_var_f32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x0 ; GFX12-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: v_max_num_f32_e64 v1, s6, s6 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index b893691..fd80580 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1811,60 +1811,59 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half ; ; VI-LABEL: s_copysign_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_movk_i32 s4, 0x7fff +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_movk_i32 s0, 0x7fff ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_lshr_b32 s3, s3, 16 -; VI-NEXT: s_lshr_b32 s2, s2, 16 -; VI-NEXT: v_bfi_b32 v0, s4, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_lshr_b32 s1, s7, 16 +; VI-NEXT: s_lshr_b32 s2, s6, 16 +; VI-NEXT: v_bfi_b32 v0, s0, v0, v1 ; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: v_bfi_b32 v1, s4, v1, v2 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_bfi_b32 v1, s0, v1, v2 ; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_copysign_v2f16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_movk_i32 s0, 0x7fff ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: s_lshr_b32 s3, s3, 16 -; GFX9-NEXT: s_lshr_b32 s2, s2, 16 -; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: s_lshr_b32 s1, s7, 16 +; GFX9-NEXT: s_lshr_b32 s2, s6, 16 +; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: s_copysign_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-NEXT: s_lshr_b32 s3, s3, 16 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0 -; GFX11-NEXT: s_lshr_b32 s2, s2, 16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_lshr_b32 s0, s7, 16 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: s_lshr_b32 s0, s6, 16 +; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index f48961c..fb04b66 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -21,25 +21,25 @@ define amdgpu_kernel void @s_test_copysign_f32(ptr addrspace(1) %out, float %mag ; ; VI-LABEL: s_test_copysign_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s4, -2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s0, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_bfi_b32 v2, s4, v0, v1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_bfi_b32 v2, s0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s6, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1018,26 +1018,26 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_f16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s4, -2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s0, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_bfi_b32 v2, s4, v1, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s7 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_bfi_b32 v2, s0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32_fpext_f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s6, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1116,26 +1116,26 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out, ; ; VI-LABEL: s_test_copysign_f32_fpext_bf16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_brev_b32 s4, -2 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_brev_b32 s0, -2 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3 -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: v_bfi_b32 v2, s4, v1, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s7 +; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_bfi_b32 v2, s0, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_copysign_f32_fpext_bf16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3 +; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s6, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 0468175..b639768 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -94,12 +94,12 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s6, v2, s6 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -110,18 +110,18 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX8-NEXT: v_div_fixup_f32 v2, v1, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_fixup_f32 v2, v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_f32_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -133,16 +133,16 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s6 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -155,8 +155,8 @@ define amdgpu_kernel void @s_fdiv_f32_ninf(ptr addrspace(1) %out, float %a, floa ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -253,12 +253,12 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; ; GFX8-LABEL: s_fdiv_f32_ieee: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s6, v2, s6 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX8-NEXT: v_fma_f32 v3, v4, v3, v3 @@ -267,49 +267,49 @@ define amdgpu_kernel void @s_fdiv_f32_ieee(ptr addrspace(1) %out, float %a, floa ; GFX8-NEXT: v_fma_f32 v4, v5, v3, v4 ; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX8-NEXT: v_div_fixup_f32 v2, v1, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_fixup_f32 v2, v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_f32_ieee: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_ieee: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s6 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -353,48 +353,48 @@ define amdgpu_kernel void @s_fdiv_25ulp_f32(ptr addrspace(1) %out, float %a, flo ; ; GFX8-LABEL: s_fdiv_25ulp_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v0, 0x6f800000 ; GFX8-NEXT: v_mov_b32_e32 v1, 0x2f800000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e64 vcc, |s3|, v0 +; GFX8-NEXT: v_cmp_gt_f32_e64 vcc, |s7|, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc -; GFX8-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX8-NEXT: v_mul_f32_e32 v1, s7, v0 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX8-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX8-NEXT: v_mul_f32_e32 v2, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_25ulp_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| -; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4 -; GFX10-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX10-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7| +; GFX10-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0 +; GFX10-NEXT: v_mul_f32_e32 v1, s7, v0 ; GFX10-NEXT: v_rcp_f32_e32 v1, v1 -; GFX10-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX10-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e64 s4, 0x6f800000, |s3| -; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s4 -; GFX11-NEXT: v_mul_f32_e32 v1, s3, v0 +; GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0x6f800000, |s7| +; GFX11-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x2f800000, s0 +; GFX11-NEXT: v_mul_f32_e32 v1, s7, v0 ; GFX11-NEXT: v_rcp_f32_e32 v1, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX11-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -465,52 +465,52 @@ define amdgpu_kernel void @s_fdiv_25ulp_ieee_f32(ptr addrspace(1) %out, float %a ; ; GFX8-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s3 +; GFX8-NEXT: v_frexp_mant_f32_e32 v1, s7 ; GFX8-NEXT: v_rcp_f32_e32 v1, v1 -; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v0, s3 -; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v2, s2 -; GFX8-NEXT: v_frexp_mant_f32_e32 v3, s2 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v0, s7 +; GFX8-NEXT: v_frexp_exp_i32_f32_e32 v2, s6 +; GFX8-NEXT: v_frexp_mant_f32_e32 v3, s6 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v2, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX8-NEXT: v_ldexp_f32 v2, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s3 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 -; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s2 -; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX10-NEXT: v_frexp_mant_f32_e32 v0, s7 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 +; GFX10-NEXT: v_frexp_mant_f32_e32 v2, s6 +; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 ; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, v3, v1 ; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX10-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-NEXT: global_store_dword v2, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_25ulp_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s3 -; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s3 -; GFX11-NEXT: v_frexp_mant_f32_e32 v2, s2 -; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v3, s2 +; GFX11-NEXT: v_frexp_mant_f32_e32 v0, s7 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v1, s7 +; GFX11-NEXT: v_frexp_mant_f32_e32 v2, s6 +; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v3, s6 ; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, v3, v1 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -548,33 +548,33 @@ define amdgpu_kernel void @s_fdiv_fast_ieee_f32(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_fast_ieee_f32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s3 -; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_rcp_f32_e32 v0, s7 +; GFX8-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_fast_ieee_f32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_fast_ieee_f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s3 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -612,33 +612,33 @@ define amdgpu_kernel void @s_fdiv_f32_fast_math(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s3 -; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_rcp_f32_e32 v0, s7 +; GFX8-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s3 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -676,33 +676,33 @@ define amdgpu_kernel void @s_fdiv_ulp25_f32_fast_math(ptr addrspace(1) %out, flo ; ; GFX8-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s3 -; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_rcp_f32_e32 v0, s7 +; GFX8-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_ulp25_f32_fast_math: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s3 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -805,12 +805,12 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_daz: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_div_scale_f32 v1, s[4:5], s3, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s3 -; GFX8-NEXT: v_div_scale_f32 v2, vcc, s2, v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_div_scale_f32 v1, s[0:1], s7, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_div_scale_f32 v2, vcc, s6, v2, s6 ; GFX8-NEXT: v_rcp_f32_e32 v3, v1 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 ; GFX8-NEXT: v_fma_f32 v4, -v1, v3, 1.0 @@ -821,18 +821,18 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; GFX8-NEXT: v_fma_f32 v1, -v1, v4, v2 ; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v4 -; GFX8-NEXT: v_div_fixup_f32 v2, v1, s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_div_fixup_f32 v2, v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_f32_arcp_daz: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 -; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX10-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 +; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: s_denorm_mode 15 ; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0 @@ -844,16 +844,16 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; GFX10-NEXT: s_denorm_mode 12 ; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_daz: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2 -; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s6 +; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: s_denorm_mode 15 ; GFX11-NEXT: s_waitcnt_depctr 0xfff @@ -866,8 +866,8 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, ; GFX11-NEXT: s_denorm_mode 12 ; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -905,33 +905,33 @@ define amdgpu_kernel void @s_fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, ; ; GFX8-LABEL: s_fdiv_f32_arcp_ninf: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_rcp_f32_e32 v0, s3 -; GFX8-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_rcp_f32_e32 v0, s7 +; GFX8-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: s_fdiv_f32_arcp_ninf: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_rcp_f32_e32 v0, s3 -; GFX10-NEXT: v_mul_f32_e32 v0, s2, v0 -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_rcp_f32_e32 v0, s7 +; GFX10-NEXT: v_mul_f32_e32 v0, s6, v0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_fdiv_f32_arcp_ninf: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rcp_f32_e32 v0, s3 +; GFX11-NEXT: v_rcp_f32_e32 v0, s7 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s2, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, s6, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index e445729..ab3650f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -4300,14 +4300,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN2-LABEL: atomic_cmpxchg_i32_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s0, s0, 16 -; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_add_u32 s0, s4, 16 +; GCN2-NEXT: s_addc_u32 s1, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4315,12 +4315,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old ; ; GCN3-LABEL: atomic_cmpxchg_i32_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4557,12 +4557,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN2-LABEL: atomic_cmpxchg_i32: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_mov_b32_e32 v0, s0 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: v_mov_b32_e32 v1, s1 -; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 ; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4570,12 +4570,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) { ; ; GCN3-LABEL: atomic_cmpxchg_i32: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: v_mov_b32_e32 v0, s0 -; GCN3-NEXT: v_mov_b32_e32 v2, s2 -; GCN3-NEXT: v_mov_b32_e32 v1, s1 -; GCN3-NEXT: v_mov_b32_e32 v3, s3 +; GCN3-NEXT: v_mov_b32_e32 v0, s4 +; GCN3-NEXT: v_mov_b32_e32 v2, s6 +; GCN3-NEXT: v_mov_b32_e32 v1, s5 +; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 5bd5271..816142d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -3853,13 +3853,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_max_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s3, 31 -; GCN2-NEXT: s_mov_b32 s4, s3 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s1, s7, 31 +; GCN2-NEXT: s_mov_b32 s0, s7 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -3869,7 +3869,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -3883,13 +3883,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_max_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -3897,7 +3897,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB88_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4057,13 +4057,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN2-LABEL: atomic_max_i32_addr64: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s3, 31 -; GCN2-NEXT: s_mov_b32 s4, s3 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s1, s7, 31 +; GCN2-NEXT: s_mov_b32 s0, s7 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 ; GCN2-NEXT: v_mov_b32_e32 v1, s1 ; GCN2-NEXT: flat_load_dword v3, v[0:1] @@ -4071,7 +4071,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN2-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -4085,13 +4085,13 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; ; GCN3-LABEL: atomic_max_i32_addr64: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] @@ -4099,7 +4099,7 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i32 %index) ; GCN3-NEXT: .LBB90_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -4996,13 +4996,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN2-LABEL: atomic_umax_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s3, 31 -; GCN2-NEXT: s_mov_b32 s4, s3 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s1, s7, 31 +; GCN2-NEXT: s_mov_b32 s0, s7 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -5012,7 +5012,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN2-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -5026,13 +5026,13 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; ; GCN3-LABEL: atomic_umax_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -5040,7 +5040,7 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i32 ; GCN3-NEXT: .LBB102_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_max_u32_e32 v2, s2, v3 +; GCN3-NEXT: v_max_u32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol @@ -6790,13 +6790,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN2-LABEL: atomic_min_i32_addr64_offset: ; GCN2: ; %bb.0: ; %entry -; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_ashr_i32 s5, s3, 31 -; GCN2-NEXT: s_mov_b32 s4, s3 -; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN2-NEXT: s_add_u32 s0, s0, s4 -; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_ashr_i32 s1, s7, 31 +; GCN2-NEXT: s_mov_b32 s0, s7 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 ; GCN2-NEXT: s_add_u32 s0, s0, 16 ; GCN2-NEXT: s_addc_u32 s1, s1, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s0 @@ -6806,7 +6806,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN2-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -6820,13 +6820,13 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; ; GCN3-LABEL: atomic_min_i32_addr64_offset: ; GCN3: ; %bb.0: ; %entry -; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN3-NEXT: s_waitcnt lgkmcnt(0) -; GCN3-NEXT: s_ashr_i32 s5, s3, 31 -; GCN3-NEXT: s_mov_b32 s4, s3 -; GCN3-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN3-NEXT: s_add_u32 s0, s0, s4 -; GCN3-NEXT: s_addc_u32 s1, s1, s5 +; GCN3-NEXT: s_ashr_i32 s1, s7, 31 +; GCN3-NEXT: s_mov_b32 s0, s7 +; GCN3-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GCN3-NEXT: s_add_u32 s0, s4, s0 +; GCN3-NEXT: s_addc_u32 s1, s5, s1 ; GCN3-NEXT: v_mov_b32_e32 v0, s0 ; GCN3-NEXT: v_mov_b32_e32 v1, s1 ; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 @@ -6834,7 +6834,7 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i32 % ; GCN3-NEXT: .LBB125_1: ; %atomicrmw.start ; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN3-NEXT: v_min_i32_e32 v2, s2, v3 +; GCN3-NEXT: v_min_i32_e32 v2, s6, v3 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index bac2d8b8..2a9a9ef 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -41,17 +41,17 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -101,22 +101,22 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[4:5], v[0:1], v[2:3], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] -; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[4:5] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -166,17 +166,17 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p ; ; GFX11-LABEL: combine_to_fma_f64_1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -220,17 +220,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_0_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[4:5] -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -280,22 +280,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_f64_0_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[4:5], v[0:1], v[2:3], -v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7] -; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[4:5] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -345,17 +345,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[4:5] -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -405,22 +405,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali ; ; GFX11-LABEL: combine_to_fma_fsub_1_f64_2use: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], v[6:7] -; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[4:5] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -470,17 +470,17 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v6, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v6, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[4:5] -; GFX11-NEXT: global_store_b64 v6, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -532,22 +532,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_neg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], -v[6:7] -; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[4:5] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -605,22 +605,22 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1) ; ; GFX11-LABEL: combine_to_fma_fsub_2_f64_2uses_mul: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 3, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b64 v[0:1], v8, s[2:3] glc dlc +; GFX11-NEXT: global_load_b64 v[0:1], v8, s[6:7] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[2:3], v8, s[2:3] offset:8 glc dlc +; GFX11-NEXT: global_load_b64 v[2:3], v8, s[6:7] offset:8 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[4:5], v8, s[2:3] offset:16 glc dlc +; GFX11-NEXT: global_load_b64 v[4:5], v8, s[6:7] offset:16 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b64 v[6:7], v8, s[2:3] offset:24 glc dlc +; GFX11-NEXT: global_load_b64 v[6:7], v8, s[6:7] offset:24 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], -v[4:5] ; GFX11-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], -v[6:7] -; GFX11-NEXT: global_store_b64 v8, v[4:5], s[0:1] dlc +; GFX11-NEXT: global_store_b64 v8, v[4:5], s[4:5] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b64 v8, v[0:1], s[0:1] offset:8 dlc +; GFX11-NEXT: global_store_b64 v8, v[0:1], s[4:5] offset:8 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -703,47 +703,47 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[6:7] glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7] ; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] -; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[4:5] ; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[6:7] glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f64 v[4:5], v[6:7], v[8:9], -v[4:5] ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[4:5] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm @@ -825,47 +825,47 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) ; ; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-NOFMA: ; %bb.0: -; GFX11-NOFMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NOFMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v10, s[6:7] glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-NOFMA-NEXT: global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc ; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-NOFMA-NEXT: v_mul_f64 v[6:7], v[6:7], v[8:9] ; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NOFMA-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7] ; GFX11-NOFMA-NEXT: v_add_f64 v[0:1], v[0:1], -v[2:3] -; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-NOFMA-NEXT: global_store_b64 v10, v[0:1], s[4:5] ; GFX11-NOFMA-NEXT: s_nop 0 ; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NOFMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v10, 3, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[2:3] glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[0:1], v10, s[6:7] glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[2:3], v10, s[6:7] offset:8 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[4:5], v10, s[6:7] offset:16 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[6:7], v10, s[6:7] offset:24 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc +; GFX11-FMA-NEXT: global_load_b64 v[8:9], v10, s[6:7] offset:32 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[6:7], v[8:9], v[0:1] ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[4:5], v[0:1] -; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[0:1] +; GFX11-FMA-NEXT: global_store_b64 v10, v[0:1], s[4:5] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 7830c91..0cb6b78 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -46,11 +46,11 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_fadd_use_test_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v0, s3, -1.0 -; GFX11-NEXT: v_add_f32_e64 v1, s2, -1.0 +; GFX11-NEXT: v_add_f32_e64 v0, s7, -1.0 +; GFX11-NEXT: v_add_f32_e64 v1, s6, -1.0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, |v0|, |v1| ; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo @@ -58,7 +58,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_f32_e32 v1, v0, v0 ; GFX11-NEXT: v_fma_f32 v0, -v1, v0, 1.0 -; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -174,14 +174,14 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo ; ; GFX11-LABEL: multiple_use_fadd_fmad_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_f32_e64 v1, |s2|, |s2| -; GFX11-NEXT: v_fma_f32 v2, |s2|, 2.0, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-NEXT: v_add_f32_e64 v1, |s6|, |s6| +; GFX11-NEXT: v_fma_f32 v2, |s6|, 2.0, s7 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] offset:4 dlc +; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 718be90..bde0dc3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -1339,11 +1339,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1355,8 +1355,8 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_mad_f16 v2, v7, v2, -v3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 @@ -1364,11 +1364,11 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1380,8 +1380,8 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, -v3 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 @@ -1389,102 +1389,102 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, ; ; GFX10-FLUSH-LABEL: mad_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-STRICT-NEXT: s_nop 0 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3 -; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm @@ -1508,11 +1508,11 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out, define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1524,8 +1524,8 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_mad_f16 v2, -v7, v2, v3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 @@ -1533,11 +1533,11 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1549,8 +1549,8 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, -v7, v2, v3 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 @@ -1558,102 +1558,102 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: mad_sub_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1 -; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-STRICT-NEXT: s_nop 0 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3 -; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm @@ -1677,11 +1677,11 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1693,8 +1693,8 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_mad_f16 v2, v7, v2, -|v3| ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 @@ -1702,11 +1702,11 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1718,8 +1718,8 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, -|v3| ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 @@ -1727,102 +1727,102 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3| -; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-STRICT-NEXT: s_nop 0 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3| -; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm @@ -1847,11 +1847,11 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture % define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1863,8 +1863,8 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_mad_f16 v2, -v7, v2, |v3| ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 @@ -1872,11 +1872,11 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; VI-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -1888,8 +1888,8 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, -v7, v2, |v3| ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 @@ -1897,102 +1897,102 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu ; ; GFX10-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1 -; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-STRICT-NEXT: s_nop 0 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3| -; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm @@ -2017,11 +2017,11 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: neg_neg_mad_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -2033,8 +2033,8 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v7, v2 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v3 @@ -2042,11 +2042,11 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; VI-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -2058,8 +2058,8 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, v2, v3 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 @@ -2067,102 +2067,102 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o ; ; GFX10-FLUSH-LABEL: neg_neg_mad_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: neg_neg_mad_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2 ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1 -; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-STRICT-NEXT: s_nop 0 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2 -; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[4:5] ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm @@ -2188,11 +2188,11 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %out, ptr addrspace(1) noalias nocapture readonly %ptr) #1 { ; VI-FLUSH-LABEL: mad_fabs_sub_f16: ; VI-FLUSH: ; %bb.0: -; VI-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -2204,8 +2204,8 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) ; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-FLUSH-NEXT: s_waitcnt vmcnt(0) -; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1 -; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5 +; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-FLUSH-NEXT: v_mad_f16 v2, v7, |v2|, -v3 ; VI-FLUSH-NEXT: flat_store_short v[0:1], v2 @@ -2213,11 +2213,11 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; VI-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; VI-DENORM-CONTRACT: ; %bb.0: -; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v6, 1, v0 ; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -2229,8 +2229,8 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5] glc ; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1 -; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5 +; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v7, |v2|, -v3 ; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2 @@ -2238,102 +2238,102 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture % ; ; GFX10-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX10-FLUSH: ; %bb.0: -; GFX10-FLUSH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FLUSH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX10-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-FLUSH-NEXT: s_endpgm ; ; GFX10-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-STRICT: ; %bb.0: -; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-STRICT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-STRICT-NEXT: s_endpgm ; ; GFX10-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX10-DENORM-CONTRACT: ; %bb.0: -; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[2:3] offset:4 glc dlc +; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7] offset:4 glc dlc ; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 -; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[4:5] ; GFX10-DENORM-CONTRACT-NEXT: s_endpgm ; ; GFX11-FLUSH-LABEL: mad_fabs_sub_f16: ; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) ; GFX11-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-FLUSH-NEXT: s_nop 0 ; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FLUSH-NEXT: s_endpgm ; ; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-STRICT: ; %bb.0: -; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2| ; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3 -; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-STRICT-NEXT: s_nop 0 ; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-STRICT-NEXT: s_endpgm ; ; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16: ; GFX11-DENORM-CONTRACT: ; %bb.0: -; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[6:7] offset:2 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc +; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7] offset:4 glc dlc ; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0) ; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3 -; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[4:5] ; GFX11-DENORM-CONTRACT-NEXT: s_nop 0 ; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-DENORM-CONTRACT-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index f411a76..ce5bb66 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -124,23 +124,23 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fnearbyint_v2f32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_rndne_f32_e32 v1, s3 -; VI-NEXT: v_rndne_f32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_rndne_f32_e32 v1, s7 +; VI-NEXT: v_rndne_f32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: fnearbyint_v2f32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_rndne_f32_e32 v1, s3 -; GFX11-NEXT: v_rndne_f32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_rndne_f32_e32 v1, s7 +; GFX11-NEXT: v_rndne_f32_e32 v0, s6 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index b5440b9..277dc01 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -2813,15 +2813,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fneg_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bitcmp1_b32 s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v2, -v0, 0, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %i = select i1 %arg1, float 0.0, float %arg @@ -3161,15 +3161,15 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f16(<2 x half> %ar ; ; VI-LABEL: s_fneg_select_infloop_regression_v2f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s1, 1, s1 -; VI-NEXT: s_cselect_b32 s0, 0, s0 -; VI-NEXT: s_xor_b32 s0, s0, 0x80008000 -; VI-NEXT: s_cmp_eq_u32 s1, 1 -; VI-NEXT: s_cselect_b32 s0, 0, s0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_and_b32 s0, 1, s5 +; VI-NEXT: s_cselect_b32 s1, 0, s4 +; VI-NEXT: s_xor_b32 s1, s1, 0x80008000 +; VI-NEXT: s_cmp_eq_u32 s0, 1 +; VI-NEXT: s_cselect_b32 s0, 0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -3293,15 +3293,15 @@ define amdgpu_kernel void @s_fabs_select_infloop_regression_f32(float %arg, i1 % ; ; VI-LABEL: s_fabs_select_infloop_regression_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bitcmp1_b32 s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v2, |v0|, 0, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %i = select i1 %arg1, float 0.0, float %arg @@ -3343,15 +3343,15 @@ define amdgpu_kernel void @s_fneg_fabs_select_infloop_regression(float %arg, i1 ; ; VI-LABEL: s_fneg_fabs_select_infloop_regression: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitcmp1_b32 s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_bitcmp1_b32 s5, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] ; VI-NEXT: v_cndmask_b32_e64 v2, -|v0|, 0, s[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %i = select i1 %arg1, float 0.0, float %arg diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index 4364b32..79f1057 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -447,14 +447,14 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in ; ; GFX11-LABEL: fneg_fabs_v4f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_or_b32 s2, s2, 0x80008000 -; GFX11-NEXT: s_or_b32 s3, s3, 0x80008000 +; GFX11-NEXT: s_or_b32 s0, s6, 0x80008000 +; GFX11-NEXT: s_or_b32 s1, s7, 0x80008000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll index 3c000d4..32033c5 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll @@ -18,12 +18,12 @@ define amdgpu_kernel void @fneg_fabsf_fadd_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fadd_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_sub_f32_e64 v2, s3, |v0| -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_sub_f32_e64 v2, s7, |v0| +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %x) @@ -49,12 +49,12 @@ define amdgpu_kernel void @fneg_fabsf_fmul_f32(ptr addrspace(1) %out, float %x, ; ; VI-LABEL: fneg_fabsf_fmul_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mul_f32_e64 v2, s3, -|v0| -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mul_f32_e64 v2, s7, -|v0| +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %fabs = call float @llvm.fabs.f32(float %x) @@ -213,14 +213,14 @@ define amdgpu_kernel void @fneg_fabsf_v2f32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fneg_fabsf_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bitset1_b32 s3, 31 -; VI-NEXT: s_bitset1_b32 s2, 31 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_or_b32 s0, s7, 0x80000000 +; VI-NEXT: s_or_b32 s1, s6, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll index d78bdfe..94fc929 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.ll @@ -65,27 +65,27 @@ define amdgpu_kernel void @s_fneg_v2f32(ptr addrspace(1) nocapture %out, <2 x fl ; ; VI-LABEL: s_fneg_v2f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s3, s3, 0x80000000 -; VI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_xor_b32 s0, s7, 0x80000000 +; VI-NEXT: s_xor_b32 s1, s6, 0x80000000 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: s_fneg_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 -; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 +; GFX11-NEXT: s_xor_b32 s0, s6, 0x80000000 +; GFX11-NEXT: s_xor_b32 s1, s7, 0x80000000 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index 9f339af..0095bcd 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -16,17 +16,17 @@ declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %d define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) @@ -36,9 +36,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS @@ -46,9 +46,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2 ; GFX12-GISEL-NEXT: ds_pk_add_bf16 v1, v0 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS @@ -116,19 +116,19 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) @@ -164,19 +164,19 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) { define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) @@ -212,20 +212,20 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) { define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) { ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll index 5761c19..7f87b41 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -24,10 +24,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) { ; ; GFX12-LABEL: flat_atomic_fadd_f32_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 ; GFX12-NEXT: s_endpgm %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data) @@ -190,10 +190,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da ; ; GFX12-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX12-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data) @@ -235,10 +235,10 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da ; ; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_mov_b32_e32 v2, s6 ; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX12-NEXT: s_endpgm %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data) @@ -280,10 +280,10 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr ; ; GFX12-LABEL: global_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v1, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -316,18 +316,18 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 ; GFX940-NEXT: ds_pk_add_f16 v0, v1 ; GFX940-NEXT: s_endpgm ; ; GFX12-LABEL: local_atomic_fadd_v2f16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: ds_pk_add_f16 v0, v1 ; GFX12-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0) @@ -359,10 +359,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) { ; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX940: ; %bb.0: -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: ds_pk_add_bf16 v0, v1 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -371,9 +371,9 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, ; ; GFX12-LABEL: local_atomic_fadd_v2bf16_noret: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX12-NEXT: ds_pk_add_bf16 v0, v1 ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 18d2e52..ca2fa0f 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -362,29 +362,29 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; ; VI-LABEL: test_isfinite_not_pattern_2: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 -; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0 -; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s6, s6 +; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s7|, v0 +; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX11-LABEL: test_isfinite_not_pattern_2: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 -; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3| +; GFX11-NEXT: v_cmp_o_f32_e64 s0, s6, s6 +; GFX11-NEXT: v_cmp_neq_f32_e64 s1, 0x7f800000, |s7| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_and_b32 s0, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 587340c..2928647 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -664,51 +664,51 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: global_store_dword v1, v0, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v0, s8 +; GFX1030-NEXT: v_mov_b32_e32 v1, s9 +; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; GFX1030-NEXT: v_mov_b32_e32 v1, 0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: global_store_dword v1, v0, s[6:7] +; GFX1030-NEXT: global_store_dword v1, v0, s[10:11] ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc +; GFX1100-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) -; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] +; GFX1100-NEXT: global_store_b32 v1, v0, s[10:11] ; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; ; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s0, 4 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[4:7], s0 offen th:TH_ATOMIC_NT_RETURN ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v1, v0, s[6:7] +; GFX12-NEXT: global_store_b32 v1, v0, s[10:11] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -743,37 +743,37 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) -; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; G_GFX1030-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) -; G_GFX1030-NEXT: global_store_dword v1, v0, s[6:7] +; G_GFX1030-NEXT: global_store_dword v1, v0, s[10:11] ; G_GFX1030-NEXT: s_endpgm ; ; G_GFX1100-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc ; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) -; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] +; G_GFX1100-NEXT: global_store_b32 v1, v0, s[10:11] ; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll index e3ed0fa..f4745a5 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll @@ -50,22 +50,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen ; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm @@ -408,22 +408,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x1 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32: ; GFX1100: ; %bb.0: ; %main_body ; GFX1100-NEXT: s_clause 0x1 -; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 -; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34 +; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen +; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen ; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm @@ -607,37 +607,37 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; GFX10-NEXT: global_store_dword v1, v0, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v0, s8 +; GFX1030-NEXT: v_mov_b32_e32 v1, s9 +; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; GFX1030-NEXT: v_mov_b32_e32 v1, 0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) -; GFX1030-NEXT: global_store_dword v1, v0, s[6:7] +; GFX1030-NEXT: global_store_dword v1, v0, s[10:11] ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; GFX1100: ; %bb.0: ; %main_body -; GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX1100-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc +; GFX1100-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc ; GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; GFX1100-NEXT: s_waitcnt vmcnt(0) -; GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] +; GFX1100-NEXT: global_store_b32 v1, v0, s[10:11] ; GFX1100-NEXT: s_nop 0 ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm @@ -672,37 +672,37 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f32_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX10-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; G_GFX10-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) -; G_GFX10-NEXT: global_store_dword v1, v0, s[6:7] +; G_GFX10-NEXT: global_store_dword v1, v0, s[10:11] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 4 offen glc slc +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 4 offen glc slc ; G_GFX1030-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) -; G_GFX1030-NEXT: global_store_dword v1, v0, s[6:7] +; G_GFX1030-NEXT: global_store_dword v1, v0, s[10:11] ; G_GFX1030-NEXT: s_endpgm ; ; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_rtn_f32_off4_slc: ; G_GFX1100: ; %bb.0: ; %main_body -; G_GFX1100-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; G_GFX1100-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 -; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 4 offen glc slc +; G_GFX1100-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 4 offen glc slc ; G_GFX1100-NEXT: v_mov_b32_e32 v1, 0 ; G_GFX1100-NEXT: s_waitcnt vmcnt(0) -; G_GFX1100-NEXT: global_store_b32 v1, v0, s[6:7] +; G_GFX1100-NEXT: global_store_b32 v1, v0, s[10:11] ; G_GFX1100-NEXT: s_nop 0 ; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; G_GFX1100-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index d827ea0..dc1f8ca 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -28,38 +28,38 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: s_mov_b32 s10, s6 -; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s2 -; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX11-LABEL: test_convert_fp16_to_fp32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_mov_b32 s11, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s0, s4 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index 03b8251..03ee67d 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -27,41 +27,41 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp16_to_fp64: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: s_mov_b32 s10, s6 -; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s2 -; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: buffer_load_ushort v0, off, s[8:11], 0 -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX8-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX11-LABEL: test_convert_fp16_to_fp64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_mov_b32 s11, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s0, s4 ; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 8ab82b7..4f6ea77 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -27,38 +27,38 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; ; GFX8-LABEL: test_convert_fp32_to_fp16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: s_mov_b32 s10, s6 -; GFX8-NEXT: s_mov_b32 s11, s7 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_mov_b32 s10, s2 +; GFX8-NEXT: s_mov_b32 s11, s3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s2 -; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0 -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_mov_b32 s1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX11-LABEL: test_convert_fp32_to_fp16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s10, s2 +; GFX11-NEXT: s_mov_b32 s11, s3 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 +; GFX11-NEXT: s_mov_b32 s8, s6 +; GFX11-NEXT: s_mov_b32 s9, s7 +; GFX11-NEXT: s_mov_b32 s0, s4 ; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 +; GFX11-NEXT: s_mov_b32 s1, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll index d610091..a058c11 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll @@ -452,26 +452,26 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v2, s11 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 -; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v2, s7 +; GFX1030-NEXT: v_mov_b32_e32 v0, s8 +; GFX1030-NEXT: v_mov_b32_e32 v1, s9 +; GFX1030-NEXT: v_mov_b32_e32 v2, s10 +; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v2, s11 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b64 v2, v[0:1] ; GFX1030-NEXT: s_endpgm @@ -506,26 +506,26 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> inre ; ; G_GFX10-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s6 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; G_GFX10-NEXT: v_mov_b32_e32 v2, s7 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s10 +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v2, s11 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6 -; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; G_GFX1030-NEXT: v_mov_b32_e32 v2, s7 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX1030-NEXT: v_mov_b32_e32 v2, s10 +; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: v_mov_b32_e32 v2, s11 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX1030-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll index 5f501fe..046c92a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll @@ -54,14 +54,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 -; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: v_mov_b32_e32 v2, s8 +; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen ; GFX1030-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_min_noret_f64: @@ -291,14 +291,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64: ; GFX1030: ; %bb.0: ; %main_body ; GFX1030-NEXT: s_clause 0x2 -; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c -; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 -; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen +; GFX1030-NEXT: v_mov_b32_e32 v0, s2 +; GFX1030-NEXT: v_mov_b32_e32 v1, s3 +; GFX1030-NEXT: v_mov_b32_e32 v2, s8 +; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen ; GFX1030-NEXT: s_endpgm ; ; G_SI-LABEL: raw_ptr_buffer_atomic_max_noret_f64: @@ -452,26 +452,26 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX10: ; %bb.0: ; %main_body -; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX10-NEXT: v_mov_b32_e32 v2, s7 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX10-NEXT: v_mov_b32_e32 v2, s11 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ds_write_b64 v2, v[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; GFX1030-NEXT: v_mov_b32_e32 v2, s6 -; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; GFX1030-NEXT: v_mov_b32_e32 v2, s7 +; GFX1030-NEXT: v_mov_b32_e32 v0, s8 +; GFX1030-NEXT: v_mov_b32_e32 v1, s9 +; GFX1030-NEXT: v_mov_b32_e32 v2, s10 +; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; GFX1030-NEXT: v_mov_b32_e32 v2, s11 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: ds_write_b64 v2, v[0:1] ; GFX1030-NEXT: s_endpgm @@ -506,26 +506,26 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp ; ; G_GFX10-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX10: ; %bb.0: ; %main_body -; G_GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX10-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX10-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s6 -; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; G_GFX10-NEXT: v_mov_b32_e32 v2, s7 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s10 +; G_GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; G_GFX10-NEXT: v_mov_b32_e32 v2, s11 ; G_GFX10-NEXT: s_waitcnt vmcnt(0) ; G_GFX10-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX10-NEXT: s_endpgm ; ; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc: ; G_GFX1030: ; %bb.0: ; %main_body -; G_GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; G_GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4 -; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5 -; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6 -; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 4 offen glc slc -; G_GFX1030-NEXT: v_mov_b32_e32 v2, s7 +; G_GFX1030-NEXT: v_mov_b32_e32 v0, s8 +; G_GFX1030-NEXT: v_mov_b32_e32 v1, s9 +; G_GFX1030-NEXT: v_mov_b32_e32 v2, s10 +; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 4 offen glc slc +; G_GFX1030-NEXT: v_mov_b32_e32 v2, s11 ; G_GFX1030-NEXT: s_waitcnt vmcnt(0) ; G_GFX1030-NEXT: ds_write_b64 v2, v[0:1] ; G_GFX1030-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index 04ef30b..b4fee70 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -100,15 +100,15 @@ define amdgpu_kernel void @fp_to_sint_v2i32(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_i32_f32_e32 v1, s3 -; VI-NEXT: v_cvt_i32_f32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_cvt_i32_f32_e32 v1, s7 +; VI-NEXT: v_cvt_i32_f32_e32 v0, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_sint_v2i32: @@ -329,24 +329,24 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; ; VI-LABEL: fp_to_sint_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s8, 0x2f800000 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s3 +; VI-NEXT: v_trunc_f32_e32 v0, s7 ; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8 -; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: v_floor_f32_e32 v1, v1 -; VI-NEXT: s_mov_b32 s0, 0xcf800000 -; VI-NEXT: v_fma_f32 v2, v1, s0, |v0| -; VI-NEXT: v_trunc_f32_e32 v4, s2 +; VI-NEXT: s_mov_b32 s4, 0xcf800000 +; VI-NEXT: v_fma_f32 v2, v1, s4, |v0| +; VI-NEXT: v_trunc_f32_e32 v4, s6 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; VI-NEXT: v_floor_f32_e32 v3, v3 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v3 -; VI-NEXT: v_fma_f32 v3, v3, s0, |v4| +; VI-NEXT: v_fma_f32 v3, v3, s4, |v4| ; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v6, v3 ; VI-NEXT: v_xor_b32_e32 v2, v2, v0 @@ -357,9 +357,9 @@ define amdgpu_kernel void @fp_to_sint_v2i64(ptr addrspace(1) %out, <2 x float> % ; VI-NEXT: v_xor_b32_e32 v0, v6, v1 ; VI-NEXT: v_xor_b32_e32 v4, v5, v1 ; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_sint_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index 5abf82a..f8ede1c 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -60,15 +60,15 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_u32_f32_e32 v1, s3 -; VI-NEXT: v_cvt_u32_f32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: v_cvt_u32_f32_e32 v1, s7 +; VI-NEXT: v_cvt_u32_f32_e32 v0, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_v2f32_to_v2i32: @@ -264,26 +264,26 @@ define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(ptr addrspace(1) %out, <2 x ; ; VI-LABEL: fp_to_uint_v2f32_to_v2i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s0, 0xcf800000 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_trunc_f32_e32 v0, s3 -; VI-NEXT: v_trunc_f32_e32 v4, s2 +; VI-NEXT: v_trunc_f32_e32 v0, s7 +; VI-NEXT: v_trunc_f32_e32 v4, s6 ; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4 ; VI-NEXT: v_floor_f32_e32 v5, v1 -; VI-NEXT: s_mov_b32 s2, 0xcf800000 ; VI-NEXT: v_floor_f32_e32 v6, v2 -; VI-NEXT: v_fma_f32 v0, v5, s2, v0 +; VI-NEXT: v_fma_f32 v0, v5, s0, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v0 -; VI-NEXT: v_fma_f32 v0, v6, s2, v4 +; VI-NEXT: v_fma_f32 v0, v6, s0, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v5 ; VI-NEXT: v_cvt_u32_f32_e32 v1, v6 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; EG-LABEL: fp_to_uint_v2f32_to_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 4ea3323..fecf303 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -126,23 +126,23 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshl_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_alignbit_b32 v2, s6, v0, 25 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshl_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 25 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshl_i32_imm: @@ -159,20 +159,20 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshl_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 25 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshl_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s6, s7, 25 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -720,29 +720,29 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; VI-LABEL: orxor2or1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s2, 7 -; VI-NEXT: s_or_b32 s4, s3, s4 -; VI-NEXT: s_cmp_eq_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshl_b32 s0, s6, 7 +; VI-NEXT: s_or_b32 s0, s7, s0 +; VI-NEXT: s_cmp_eq_u32 s0, 0 +; VI-NEXT: s_cselect_b32 s0, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: orxor2or1: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s4, s2, 7 -; GFX9-NEXT: s_or_b32 s4, s3, s4 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_lshl_b32 s0, s6, 7 +; GFX9-NEXT: s_or_b32 s0, s7, s0 +; GFX9-NEXT: s_cmp_eq_u32 s0, 0 +; GFX9-NEXT: s_cselect_b32 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: orxor2or1: @@ -761,29 +761,29 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; ; GFX10-LABEL: orxor2or1: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_lshl_b32 s4, s2, 7 -; GFX10-NEXT: s_or_b32 s4, s3, s4 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_lshl_b32 s0, s6, 7 +; GFX10-NEXT: s_or_b32 s0, s7, s0 +; GFX10-NEXT: s_cmp_eq_u32 s0, 0 +; GFX10-NEXT: s_cselect_b32 s0, s6, s7 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: orxor2or1: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshl_b32 s4, s2, 7 +; GFX11-NEXT: s_lshl_b32 s0, s6, 7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s4, s3, s4 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s2, s2, s3 +; GFX11-NEXT: s_or_b32 s0, s7, s0 +; GFX11-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-NEXT: s_cselect_b32 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index e8310e7..a5ea1ee 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -86,13 +86,13 @@ define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z ; GFX11-LABEL: fshr_i32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x34 -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x34 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, v0 -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v0, s6, s7, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -118,23 +118,23 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; VI-LABEL: fshr_i32_imm: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_alignbit_b32 v2, s2, v0, 7 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_alignbit_b32 v2, s6, v0, 7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: fshr_i32_imm: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 7 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_alignbit_b32 v1, s6, v1, 7 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; R600-LABEL: fshr_i32_imm: @@ -151,20 +151,20 @@ define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX10-LABEL: fshr_i32_imm: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 7 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s7, 7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: fshr_i32_imm: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 7 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s6, s7, 7 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index dac3a3d..4d585cf 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -4140,27 +4140,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 ; ; VI-LABEL: atomic_cmpxchg_i32_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm @@ -4391,27 +4391,27 @@ define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i3 ; ; VI-LABEL: atomic_cmpxchg_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_cmpxchg_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 516c92f..3050da03 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -4648,24 +4648,24 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_max_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s3, 31 -; VI-NEXT: s_mov_b32 s4, s3 -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_ashr_i32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s0, s7 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_add_u32 s2, s4, s0 +; VI-NEXT: s_addc_u32 s3, s5, s1 +; VI-NEXT: s_load_dword s4, s[2:3], 0x10 +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB91_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_i32_e32 v2, s2, v3 +; VI-NEXT: v_max_i32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4679,28 +4679,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_max_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB91_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -4861,22 +4861,22 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; VI-LABEL: atomic_max_i32_addr64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s3, 31 -; VI-NEXT: s_mov_b32 s4, s3 -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_ashr_i32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s0, s7 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_add_u32 s2, s4, s0 +; VI-NEXT: s_addc_u32 s3, s5, s1 +; VI-NEXT: s_load_dword s4, s[2:3], 0x0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b64 s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v3, s4 ; VI-NEXT: .LBB93_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_i32_e32 v2, s2, v3 +; VI-NEXT: v_max_i32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -4890,28 +4890,28 @@ define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, ; ; GFX9-LABEL: atomic_max_i32_addr64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB93_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -5901,24 +5901,24 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; VI-LABEL: atomic_umax_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s3, 31 -; VI-NEXT: s_mov_b32 s4, s3 -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_ashr_i32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s0, s7 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_add_u32 s2, s4, s0 +; VI-NEXT: s_addc_u32 s3, s5, s1 +; VI-NEXT: s_load_dword s4, s[2:3], 0x10 +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB105_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_max_u32_e32 v2, s2, v3 +; VI-NEXT: v_max_u32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -5932,28 +5932,28 @@ define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, ; ; GFX9-LABEL: atomic_umax_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_max_u32_e32 v0, s2, v1 +; GFX9-NEXT: v_max_u32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB105_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm @@ -7892,24 +7892,24 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; VI-LABEL: atomic_min_i32_addr64_offset: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_ashr_i32 s5, s3, 31 -; VI-NEXT: s_mov_b32 s4, s3 -; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; VI-NEXT: s_add_u32 s4, s0, s4 -; VI-NEXT: s_addc_u32 s5, s1, s5 -; VI-NEXT: s_load_dword s3, s[4:5], 0x10 -; VI-NEXT: s_add_u32 s4, s4, 16 -; VI-NEXT: s_addc_u32 s5, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_ashr_i32 s1, s7, 31 +; VI-NEXT: s_mov_b32 s0, s7 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; VI-NEXT: s_add_u32 s2, s4, s0 +; VI-NEXT: s_addc_u32 s3, s5, s1 +; VI-NEXT: s_load_dword s4, s[2:3], 0x10 +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: .LBB128_1: ; %atomicrmw.start ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_min_i32_e32 v2, s2, v3 +; VI-NEXT: v_min_i32_e32 v2, s6, v3 ; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol @@ -7923,28 +7923,28 @@ define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i ; ; GFX9-LABEL: atomic_min_i32_addr64_offset: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i32 s5, s3, 31 -; GFX9-NEXT: s_mov_b32 s4, s3 -; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GFX9-NEXT: s_add_u32 s0, s0, s4 -; GFX9-NEXT: s_addc_u32 s1, s1, s5 -; GFX9-NEXT: s_load_dword s3, s[0:1], 0x10 -; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_ashr_i32 s1, s7, 31 +; GFX9-NEXT: s_mov_b32 s0, s7 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s4 ; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_min_i32_e32 v0, s2, v1 +; GFX9-NEXT: v_min_i32_e32 v0, s6, v1 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_cbranch_execnz .LBB128_1 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index e2d5599..adb803a 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -100,13 +100,13 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg ; ; GFX11-LABEL: load_v3f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: v_mov_b32_e32 v2, s6 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] offset:4 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] +; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] offset:4 +; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -130,11 +130,11 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg ; ; GFX11-LABEL: load_v4f16_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -349,14 +349,14 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 -; GFX11-NEXT: global_store_b96 v3, v[0:2], s[0:1] +; GFX11-NEXT: s_lshr_b32 s0, s6, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s7 +; GFX11-NEXT: global_store_b96 v3, v[0:2], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -398,16 +398,16 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f32_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s3, 16 -; GFX11-NEXT: s_lshr_b32 s5, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 -; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1] +; GFX11-NEXT: s_lshr_b32 s0, s7, 16 +; GFX11-NEXT: s_lshr_b32 s1, s6, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s7 +; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -649,12 +649,12 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; ; GFX11-LABEL: extload_v3f16_to_v3f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s4 -; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s2 +; GFX11-NEXT: s_lshr_b32 s0, s6, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v6, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v1 @@ -662,8 +662,8 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v6 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v6, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -725,14 +725,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; ; GFX11-LABEL: extload_v4f16_to_v4f64_arg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_lshr_b32 s5, s3, 16 -; GFX11-NEXT: s_lshr_b32 s4, s2, 16 -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s5 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v8, s4 +; GFX11-NEXT: s_lshr_b32 s1, s7, 16 +; GFX11-NEXT: s_lshr_b32 s0, s6, 16 +; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s7 +; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s1 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s6 +; GFX11-NEXT: v_cvt_f32_f16_e32 v8, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v2 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 @@ -741,8 +741,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4 ; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; GFX11-NEXT: v_mov_b32_e32 v8, 0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 -; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2693,11 +2693,11 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x ; ; GFX11-LABEL: fadd_v2f16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v1, s2, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_pk_add_f16 v1, s6, s7 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index f736ca7..18d5c05 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -85,19 +85,19 @@ entry: define amdgpu_kernel void @float2_inselt(ptr addrspace(1) %out, <2 x float> %vec, i32 %sel) { ; GCN-LABEL: float2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: s_cmp_lg_u32 s2, 1 +; GCN-NEXT: v_mov_b32_e32 v0, s7 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_cselect_b64 vcc, -1, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm entry: @@ -289,20 +289,20 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s0, 0x3c003c00 +; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 4 -; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_lshl_b32 s2, s2, 4 +; GCN-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], s[2:3] +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -314,16 +314,16 @@ entry: define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, i32 %sel) { ; GCN-LABEL: half2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s3, s3, 4 -; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GCN-NEXT: s_andn2_b32 s2, s2, s3 -; GCN-NEXT: s_and_b32 s3, s3, 0x3c003c00 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshl_b32 s0, s7, 4 +; GCN-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GCN-NEXT: s_andn2_b32 s1, s6, s0 +; GCN-NEXT: s_and_b32 s0, s0, 0x3c003c00 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -397,16 +397,16 @@ entry: define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, i32 %sel) { ; GCN-LABEL: short2_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s3, s3, 4 -; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GCN-NEXT: s_andn2_b32 s2, s2, s3 -; GCN-NEXT: s_and_b32 s3, s3, 0x10001 -; GCN-NEXT: s_or_b32 s2, s3, s2 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_lshl_b32 s0, s7, 4 +; GCN-NEXT: s_lshl_b32 s0, 0xffff, s0 +; GCN-NEXT: s_andn2_b32 s1, s6, s0 +; GCN-NEXT: s_and_b32 s0, s0, 0x10001 +; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dword v[0:1], v2 ; GCN-NEXT: s_endpgm entry: @@ -418,20 +418,20 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s4, 0x10001 -; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s0, 0x10001 +; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 4 -; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: s_lshl_b32 s2, s2, 4 +; GCN-NEXT: s_lshl_b64 s[2:3], 0xffff, s2 +; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], s[2:3] +; GCN-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7] +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: @@ -443,19 +443,19 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s4, s[0:1], 0x34 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s4, 3 -; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 -; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 -; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: s_lshl_b32 s0, s2, 3 +; GCN-NEXT: s_lshl_b64 s[0:1], 0xff, s0 +; GCN-NEXT: s_and_b32 s3, s1, 0x1010101 +; GCN-NEXT: s_and_b32 s2, s0, 0x1010101 +; GCN-NEXT: s_andn2_b64 s[0:1], s[6:7], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s5 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm entry: @@ -962,34 +962,34 @@ entry: define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: s_mov_b32 s7, 0xe80000 -; GCN-NEXT: s_add_u32 s4, s4, s3 -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN-NEXT: s_addc_u32 s5, s5, 0 +; GCN-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s11, 0xe80000 +; GCN-NEXT: s_add_u32 s8, s8, s3 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_and_b32 s3, s3, 3 -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: v_lshrrev_b16_e64 v2, 1, s2 -; GCN-NEXT: v_lshrrev_b16_e64 v3, 2, s2 -; GCN-NEXT: v_lshrrev_b16_e64 v4, 3, s2 -; GCN-NEXT: v_or_b32_e32 v0, s3, v0 +; GCN-NEXT: s_and_b32 s0, s7, 3 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v2, 1, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v3, 2, s6 +; GCN-NEXT: v_lshrrev_b16_e64 v4, 3, s6 +; GCN-NEXT: v_or_b32_e32 v0, s0, v0 ; GCN-NEXT: v_and_b32_e32 v2, 1, v2 ; GCN-NEXT: v_and_b32_e32 v3, 3, v3 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: buffer_store_byte v1, off, s[4:7], 0 -; GCN-NEXT: buffer_store_byte v4, off, s[4:7], 0 offset:3 -; GCN-NEXT: buffer_store_byte v3, off, s[4:7], 0 offset:2 -; GCN-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1 +; GCN-NEXT: buffer_store_byte v1, off, s[8:11], 0 +; GCN-NEXT: buffer_store_byte v4, off, s[8:11], 0 offset:3 +; GCN-NEXT: buffer_store_byte v3, off, s[8:11], 0 offset:2 +; GCN-NEXT: buffer_store_byte v2, off, s[8:11], 0 offset:1 ; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: buffer_store_byte v1, v0, s[4:7], 0 offen -; GCN-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:1 -; GCN-NEXT: buffer_load_ubyte v2, off, s[4:7], 0 offset:2 -; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:3 +; GCN-NEXT: buffer_store_byte v1, v0, s[8:11], 0 offen +; GCN-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 +; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:1 +; GCN-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2 +; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:3 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_waitcnt vmcnt(2) @@ -1004,8 +1004,8 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: v_or_b32_e32 v0, v0, v3 ; GCN-NEXT: v_and_b32_e32 v2, 15, v0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: flat_store_byte v[0:1], v2 ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index df03e89..1313460 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -436,193 +436,193 @@ entry: define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-LABEL: udiv_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-NEXT: s_add_i32 s6, s4, 1 -; GFX9-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-NEXT: s_add_i32 s5, s4, 1 -; GFX9-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-NEXT: s_add_i32 s1, s1, s0 +; GFX9-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-NEXT: s_add_i32 s2, s0, 1 +; GFX9-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX90A-LABEL: udiv_i32: ; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 -; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX90A-NEXT: s_sub_i32 s4, 0, s3 +; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX90A-NEXT: s_sub_i32 s0, 0, s7 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX90A-NEXT: v_readfirstlane_b32 s5, v0 -; GFX90A-NEXT: s_mul_i32 s4, s4, s5 -; GFX90A-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX90A-NEXT: s_add_i32 s5, s5, s4 -; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX90A-NEXT: s_mul_i32 s5, s4, s3 -; GFX90A-NEXT: s_sub_i32 s2, s2, s5 -; GFX90A-NEXT: s_add_i32 s6, s4, 1 -; GFX90A-NEXT: s_sub_i32 s5, s2, s3 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s4, s6, s4 -; GFX90A-NEXT: s_cselect_b32 s2, s5, s2 -; GFX90A-NEXT: s_add_i32 s5, s4, 1 -; GFX90A-NEXT: s_cmp_ge_u32 s2, s3 -; GFX90A-NEXT: s_cselect_b32 s2, s5, s4 -; GFX90A-NEXT: v_mov_b32_e32 v0, s2 -; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] +; GFX90A-NEXT: v_readfirstlane_b32 s1, v0 +; GFX90A-NEXT: s_mul_i32 s0, s0, s1 +; GFX90A-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX90A-NEXT: s_add_i32 s1, s1, s0 +; GFX90A-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX90A-NEXT: s_mul_i32 s1, s0, s7 +; GFX90A-NEXT: s_sub_i32 s1, s6, s1 +; GFX90A-NEXT: s_add_i32 s2, s0, 1 +; GFX90A-NEXT: s_sub_i32 s3, s1, s7 +; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 +; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 +; GFX90A-NEXT: s_cselect_b32 s1, s3, s1 +; GFX90A-NEXT: s_add_i32 s2, s0, 1 +; GFX90A-NEXT: s_cmp_ge_u32 s1, s7 +; GFX90A-NEXT: s_cselect_b32 s0, s2, s0 +; GFX90A-NEXT: v_mov_b32_e32 v0, s0 +; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: udiv_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX10-NEXT: s_sub_i32 s5, 0, s3 +; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX10-NEXT: s_sub_i32 s1, 0, s7 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX10-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_mul_i32 s5, s5, s4 -; GFX10-NEXT: s_mul_hi_u32 s5, s4, s5 -; GFX10-NEXT: s_add_i32 s4, s4, s5 -; GFX10-NEXT: s_mul_hi_u32 s4, s2, s4 -; GFX10-NEXT: s_mul_i32 s5, s4, s3 -; GFX10-NEXT: s_sub_i32 s2, s2, s5 -; GFX10-NEXT: s_add_i32 s5, s4, 1 -; GFX10-NEXT: s_sub_i32 s6, s2, s3 -; GFX10-NEXT: s_cmp_ge_u32 s2, s3 -; GFX10-NEXT: s_cselect_b32 s4, s5, s4 -; GFX10-NEXT: s_cselect_b32 s2, s6, s2 -; GFX10-NEXT: s_add_i32 s5, s4, 1 -; GFX10-NEXT: s_cmp_ge_u32 s2, s3 -; GFX10-NEXT: s_cselect_b32 s2, s5, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_mul_i32 s1, s1, s0 +; GFX10-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX10-NEXT: s_add_i32 s0, s0, s1 +; GFX10-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX10-NEXT: s_mul_i32 s1, s0, s7 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_sub_i32 s1, s6, s1 +; GFX10-NEXT: s_sub_i32 s3, s1, s7 +; GFX10-NEXT: s_cmp_ge_u32 s1, s7 +; GFX10-NEXT: s_cselect_b32 s0, s2, s0 +; GFX10-NEXT: s_cselect_b32 s1, s3, s1 +; GFX10-NEXT: s_add_i32 s2, s0, 1 +; GFX10-NEXT: s_cmp_ge_u32 s1, s7 +; GFX10-NEXT: s_cselect_b32 s0, s2, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm ; ; GFX9-FLATSCR-LABEL: udiv_i32: ; GFX9-FLATSCR: ; %bb.0: -; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3 +; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX9-FLATSCR-NEXT: s_sub_i32 s0, 0, s7 ; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0 -; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, s5 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s5, s4 -; GFX9-FLATSCR-NEXT: s_add_i32 s5, s5, s4 -; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s2, s5 -; GFX9-FLATSCR-NEXT: s_mul_i32 s5, s4, s3 -; GFX9-FLATSCR-NEXT: s_sub_i32 s2, s2, s5 -; GFX9-FLATSCR-NEXT: s_add_i32 s6, s4, 1 -; GFX9-FLATSCR-NEXT: s_sub_i32 s5, s2, s3 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s4, s6, s4 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s2 -; GFX9-FLATSCR-NEXT: s_add_i32 s5, s4, 1 -; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3 -; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s4 -; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s1, v0 +; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s1, s0 +; GFX9-FLATSCR-NEXT: s_add_i32 s1, s1, s0 +; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s0, s6, s1 +; GFX9-FLATSCR-NEXT: s_mul_i32 s1, s0, s7 +; GFX9-FLATSCR-NEXT: s_sub_i32 s1, s6, s1 +; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 +; GFX9-FLATSCR-NEXT: s_sub_i32 s3, s1, s7 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s1, s3, s1 +; GFX9-FLATSCR-NEXT: s_add_i32 s2, s0, 1 +; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s1, s7 +; GFX9-FLATSCR-NEXT: s_cselect_b32 s0, s2, s0 +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[4:5] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: s_endpgm ; ; GFX11-LABEL: udiv_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3 -; GFX11-NEXT: s_sub_i32 s5, 0, s3 +; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s7 +; GFX11-NEXT: s_sub_i32 s1, 0, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_mul_i32 s5, s5, s4 +; GFX11-NEXT: s_mul_i32 s1, s1, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s5, s4, s5 -; GFX11-NEXT: s_add_i32 s4, s4, s5 +; GFX11-NEXT: s_mul_hi_u32 s1, s0, s1 +; GFX11-NEXT: s_add_i32 s0, s0, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_hi_u32 s4, s2, s4 -; GFX11-NEXT: s_mul_i32 s5, s4, s3 +; GFX11-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX11-NEXT: s_mul_i32 s1, s0, s7 +; GFX11-NEXT: s_add_i32 s2, s0, 1 +; GFX11-NEXT: s_sub_i32 s1, s6, s1 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_sub_i32 s2, s2, s5 -; GFX11-NEXT: s_add_i32 s5, s4, 1 -; GFX11-NEXT: s_sub_i32 s6, s2, s3 -; GFX11-NEXT: s_cmp_ge_u32 s2, s3 -; GFX11-NEXT: s_cselect_b32 s4, s5, s4 -; GFX11-NEXT: s_cselect_b32 s2, s6, s2 -; GFX11-NEXT: s_add_i32 s5, s4, 1 -; GFX11-NEXT: s_cmp_ge_u32 s2, s3 -; GFX11-NEXT: s_cselect_b32 s2, s5, s4 +; GFX11-NEXT: s_sub_i32 s3, s1, s7 +; GFX11-NEXT: s_cmp_ge_u32 s1, s7 +; GFX11-NEXT: s_cselect_b32 s0, s2, s0 +; GFX11-NEXT: s_cselect_b32 s1, s3, s1 +; GFX11-NEXT: s_add_i32 s2, s0, 1 +; GFX11-NEXT: s_cmp_ge_u32 s1, s7 +; GFX11-NEXT: s_cselect_b32 s0, s2, s0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: udiv_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_cvt_f32_u32 s4, s3 -; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 +; GFX12-NEXT: s_cvt_f32_u32 s0, s7 +; GFX12-NEXT: s_sub_co_i32 s1, 0, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_3) -; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe -; GFX12-NEXT: s_cvt_u32_f32 s4, s4 +; GFX12-NEXT: s_mul_f32 s0, s0, 0x4f7ffffe +; GFX12-NEXT: s_cvt_u32_f32 s0, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s5, s5, s4 -; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_add_co_i32 s4, s4, s5 -; GFX12-NEXT: s_mul_hi_u32 s4, s2, s4 +; GFX12-NEXT: s_mul_i32 s1, s1, s0 +; GFX12-NEXT: s_mul_hi_u32 s1, s0, s1 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_i32 s5, s4, s3 -; GFX12-NEXT: s_sub_co_i32 s2, s2, s5 -; GFX12-NEXT: s_add_co_i32 s5, s4, 1 -; GFX12-NEXT: s_sub_co_i32 s6, s2, s3 -; GFX12-NEXT: s_cmp_ge_u32 s2, s3 -; GFX12-NEXT: s_cselect_b32 s4, s5, s4 -; GFX12-NEXT: s_cselect_b32 s2, s6, s2 -; GFX12-NEXT: s_add_co_i32 s5, s4, 1 -; GFX12-NEXT: s_cmp_ge_u32 s2, s3 -; GFX12-NEXT: s_cselect_b32 s2, s5, s4 +; GFX12-NEXT: s_add_co_i32 s0, s0, s1 +; GFX12-NEXT: s_mul_hi_u32 s0, s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_i32 s1, s0, s7 +; GFX12-NEXT: s_add_co_i32 s2, s0, 1 +; GFX12-NEXT: s_sub_co_i32 s1, s6, s1 +; GFX12-NEXT: s_sub_co_i32 s3, s1, s7 +; GFX12-NEXT: s_cmp_ge_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s0, s2, s0 +; GFX12-NEXT: s_cselect_b32 s1, s3, s1 +; GFX12-NEXT: s_add_co_i32 s2, s0, 1 +; GFX12-NEXT: s_cmp_ge_u32 s1, s7 +; GFX12-NEXT: s_cselect_b32 s0, s2, s0 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_endpgm %r = udiv i32 %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 69f181f..a344128 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -731,12 +731,12 @@ define amdgpu_kernel void @v2i32_arg(ptr addrspace(1) nocapture %out, <2 x i32> ; ; VI-LABEL: v2i32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -794,12 +794,12 @@ define amdgpu_kernel void @v2f32_arg(ptr addrspace(1) nocapture %out, <2 x float ; ; VI-LABEL: v2f32_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -998,16 +998,16 @@ define amdgpu_kernel void @v3i16_arg(ptr addrspace(1) nocapture %out, <3 x i16> ; ; VI-LABEL: v3i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 4 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: s_add_u32 s0, s4, 4 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm @@ -1332,12 +1332,12 @@ define amdgpu_kernel void @v4i16_arg(ptr addrspace(1) %out, <4 x i16> %in) { ; ; VI-LABEL: v4i16_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -1545,16 +1545,16 @@ define amdgpu_kernel void @v5i8_arg(ptr addrspace(1) nocapture %out, <5 x i8> %i ; ; VI-LABEL: v5i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s4, s0, 4 -; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v5, s2 +; VI-NEXT: s_add_u32 s0, s4, 4 +; VI-NEXT: s_addc_u32 s1, s5, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: flat_store_byte v[2:3], v4 ; VI-NEXT: flat_store_dword v[0:1], v5 ; VI-NEXT: s_endpgm @@ -2397,12 +2397,12 @@ define amdgpu_kernel void @v8i8_arg(ptr addrspace(1) %out, <8 x i8> %in) { ; ; VI-LABEL: v8i8_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -4539,19 +4539,19 @@ define amdgpu_kernel void @i65_arg(ptr addrspace(1) nocapture %out, i65 %in) nou ; ; VI-LABEL: i65_arg: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dword s4, s[0:1], 0x34 -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s2, s[0:1], 0x34 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s4, 1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_add_u32 s0, s0, 8 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_and_b32 s1, s2, 1 +; VI-NEXT: s_add_u32 s0, s4, 8 +; VI-NEXT: v_mov_b32_e32 v6, s1 +; VI-NEXT: s_addc_u32 s1, s5, 0 ; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v6, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: flat_store_byte v[4:5], v6 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -5565,18 +5565,18 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) { ; ; VI-LABEL: array_3xi32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: flat_store_short v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s5 ; VI-NEXT: flat_store_dword v[0:1], v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 920ff8a..01a1ab4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -21,41 +21,41 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, float %x ; ; VI-LABEL: s_cvt_pkrtz_v2f16_f32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, v0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s6, v0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s2, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, s6, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_cvt_pkrtz_v2f16_f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s6, s7 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll index 17b941c..a737c5e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll @@ -15,47 +15,47 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| +; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) @@ -66,47 +66,47 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; SDAG-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| +; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; SDAG-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; SDAG-GFX10: ; %bb.0: -; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| -; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2 -; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| +; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0 +; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX10-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| +; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| ; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm ; ; GISEL-GFX10-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GISEL-GFX10: ; %bb.0: -; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s2, |s2|, |s3| -; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX10-NEXT: v_cmp_eq_f32_e64 s0, |s6|, |s7| +; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX10-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll index ce055d6..7d41cf1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll @@ -16,53 +16,53 @@ declare half @llvm.fabs.f16(half) #0 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| +; GFX11-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX9-LABEL: v_fcmp_f32_oeq_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s7 +; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |v0| +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |v0| -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s7 +; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |v0| +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) @@ -74,53 +74,53 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(ptr addrspace(1) %out, float define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(ptr addrspace(1) %out, float %src, float %a) { ; GFX11-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |s3| +; GFX11-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |s7| ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX9-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |v0| +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-SDAG-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3 -; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| -; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3 +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s7 +; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |v0| +; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 ; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: v_fcmp_f32_oeq_both_operands_with_fabs: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], |s2|, |v0| -; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 -; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s7 +; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[0:1], |s6|, |v0| +; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 +; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll index d7dd0ce..4a66b76 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32) ; GCN-LABEL: {{^}}global_atomic_csub_rtn: ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) { main_body: %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data) @@ -15,7 +15,7 @@ main_body: ; GCN-LABEL: {{^}}global_atomic_csub_no_rtn: ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] +; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 { main_body: %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data) @@ -24,7 +24,7 @@ main_body: ; GCN-LABEL: {{^}}global_atomic_csub_off4_rtn: ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 glc -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4 th:TH_ATOMIC_RETURN +; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 th:TH_ATOMIC_RETURN define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) { main_body: %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1 @@ -34,7 +34,7 @@ main_body: ; GCN-LABEL: {{^}}global_atomic_csub_off4_no_rtn: ; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 -; GFX12PLUS: global_atomic_sub_clamp_u32 v0, v1, s[0:1] offset:4 +; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4 define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 { main_body: %p = getelementptr i32, ptr addrspace(1) %ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 309fd99..ae61b58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -1743,32 +1743,32 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_gt_u32 s2, 1 -; GFX11-NEXT: s_cselect_b32 s2, -1, 0 -; GFX11-NEXT: s_cmp_gt_u32 s3, 2 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 +; GFX11-NEXT: s_cmp_gt_u32 s6, 1 +; GFX11-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-NEXT: s_cmp_gt_u32 s7, 2 +; GFX11-NEXT: s_cselect_b32 s1, -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_and_b32 s0, s0, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_gt_u32 s2, 1 -; GFX10-NEXT: s_cselect_b32 s2, -1, 0 -; GFX10-NEXT: s_cmp_gt_u32 s3, 2 -; GFX10-NEXT: s_cselect_b32 s3, -1, 0 -; GFX10-NEXT: s_and_b32 s2, s2, s3 -; GFX10-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_cmp_gt_u32 s6, 1 +; GFX10-NEXT: s_cselect_b32 s0, -1, 0 +; GFX10-NEXT: s_cmp_gt_u32 s7, 2 +; GFX10-NEXT: s_cselect_b32 s1, -1, 0 +; GFX10-NEXT: s_and_b32 s0, s0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 5f979e0..54931ac 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -1952,51 +1952,51 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) { define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX11-LABEL: v_icmp_i1_ne0: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_gt_u32 s2, 1 -; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX11-NEXT: s_cmp_gt_u32 s3, 2 +; GFX11-NEXT: s_cmp_gt_u32 s6, 1 +; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX11-NEXT: s_cmp_gt_u32 s7, 2 ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm ; ; VI-LABEL: v_icmp_i1_ne0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_gt_u32 s2, 1 -; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; VI-NEXT: s_cmp_gt_u32 s3, 2 +; VI-NEXT: s_cmp_gt_u32 s6, 1 +; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; VI-NEXT: s_cmp_gt_u32 s7, 2 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: v_icmp_i1_ne0: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_cmp_gt_u32 s2, 1 -; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX9-NEXT: s_cmp_gt_u32 s3, 2 +; GFX9-NEXT: s_cmp_gt_u32 s6, 1 +; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX9-NEXT: s_cmp_gt_u32 s7, 2 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %c0 = icmp ugt i32 %a, 1 %c1 = icmp ugt i32 %b, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll index dba67a0..b070602 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.ll @@ -14,12 +14,12 @@ entry: define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000000) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: v_add_u32_e32 v2, 0x6000, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v2 offset:57456 ; GCN-NEXT: ds_read_b128 a[24:27], v2 offset:57440 @@ -44,7 +44,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_read_b128 a[152:155], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[68:71], v1 offset:24592 ; GCN-NEXT: ds_read_b128 a[64:67], v1 offset:24576 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(4) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v2, v3, a[32:63] ; GCN-NEXT: ds_read_b128 a[148:151], v1 offset:80 @@ -80,7 +80,7 @@ define amdgpu_kernel void @test_iglp_opt_mfma_gemm(ptr addrspace(3) noalias %in, ; GCN-NEXT: ds_write_b128 v0, a[136:139] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[132:135] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[128:131] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: s_waitcnt lgkmcnt(8) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v2, v3, a[64:95] ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:24672 @@ -151,13 +151,13 @@ entry: define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_iglp_opt_rev_mfma_gemm: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: ; iglp_opt mask(0x00000001) ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -176,7 +176,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_read_b128 a[136:139], v1 offset:8224 ; GCN-NEXT: ds_read_b128 a[132:135], v1 offset:8208 ; GCN-NEXT: ds_read_b128 a[128:131], v1 offset:8192 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[128:159], v2, v3, a[128:159] ; GCN-NEXT: ds_read_b128 a[124:127], v1 offset:24688 @@ -218,7 +218,7 @@ define amdgpu_kernel void @test_iglp_opt_rev_mfma_gemm(ptr addrspace(3) noalias ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: ds_write_b128 v0, a[152:155] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[156:159] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[144:147] offset:8256 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll index 8d0397c..3a77b3b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -163,7 +163,7 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 @@ -171,10 +171,10 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 ; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s0, s0, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 -; GFX1013-NEXT: v_add_co_u32 v4, s0, s2, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v2, s0, s4, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s5, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v4, s0, s6, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s7, 0, s0 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -182,14 +182,14 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11] ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 ; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 @@ -199,33 +199,33 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX1030-NEXT: v_add_co_u32 v0, s0, s4, v2 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s0 +; GFX1030-NEXT: v_add_co_u32 v2, s0, s6, v2 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s0 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[4:7] +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:10], s[8:11] ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_lshlrev_b32 v2, 2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 +; GFX11-NEXT: v_add_co_u32 v0, s0, s4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s0 +; GFX11-NEXT: v_add_co_u32 v2, s0, s6, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s0 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000 @@ -233,7 +233,7 @@ define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(ptr %p_node_ptr, ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[4:7] +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[6:8], v[3:5], v[0:2]], s[8:11] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm @@ -260,15 +260,15 @@ main_body: define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ptr, ptr %p_ray, <4 x i32> inreg %tdescr) { ; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1013: ; %bb.0: ; %main_body -; GFX1013-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1013-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1013-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1013-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1013-NEXT: s_waitcnt lgkmcnt(0) -; GFX1013-NEXT: v_add_co_u32 v2, s0, s0, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s1, 0, s0 -; GFX1013-NEXT: v_add_co_u32 v4, s0, s2, v0 -; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v2, s0, s4, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v3, s0, s5, 0, s0 +; GFX1013-NEXT: v_add_co_u32 v4, s0, s6, v0 +; GFX1013-NEXT: v_add_co_ci_u32_e64 v5, s0, s7, 0, s0 ; GFX1013-NEXT: flat_load_dword v0, v[2:3] ; GFX1013-NEXT: flat_load_dword v1, v[4:5] ; GFX1013-NEXT: v_mov_b32_e32 v2, 0 @@ -276,53 +276,53 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_ ; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1013-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 +; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 ; GFX1013-NEXT: s_waitcnt vmcnt(0) ; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1013-NEXT: s_endpgm ; ; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX1030: ; %bb.0: ; %main_body -; GFX1030-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GFX1030-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 ; GFX1030-NEXT: v_mov_b32_e32 v5, 0x44004200 ; GFX1030-NEXT: v_mov_b32_e32 v6, 0x46004500 ; GFX1030-NEXT: v_mov_b32_e32 v7, 0x48004700 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) -; GFX1030-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX1030-NEXT: v_add_co_u32 v2, s0, s2, v2 -; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX1030-NEXT: v_add_co_u32 v0, s0, s4, v2 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s0 +; GFX1030-NEXT: v_add_co_u32 v2, s0, s6, v2 +; GFX1030-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s0 ; GFX1030-NEXT: flat_load_dword v0, v[0:1] ; GFX1030-NEXT: flat_load_dword v1, v[2:3] ; GFX1030-NEXT: v_mov_b32_e32 v2, 0 ; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 ; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[4:7] a16 +; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] ; GFX1030-NEXT: s_endpgm ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, s2, v2 +; GFX11-NEXT: v_add_co_u32 v0, s0, s4, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s0 +; GFX11-NEXT: v_add_co_u32 v2, s0, s6, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[4:7] a16 +; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[8:11] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 265d64f..634159a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -85,12 +85,12 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-LABEL: v_permlane16_b32_vii: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -131,13 +131,13 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-LABEL: v_permlane16_b32_vll: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-NEXT: s_movk_i32 s0, 0x1234 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -203,35 +203,35 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -245,76 +245,76 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0 define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vvs: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s7 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vvs: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s7 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -325,84 +325,72 @@ define amdgpu_kernel void @v_permlane16_b32_vvs(ptr addrspace(1) %out, i32 %src0 } define amdgpu_kernel void @v_permlane16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlane16_b32_vsv: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlane16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlane16_b32_vsv: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX10-LABEL: v_permlane16_b32_vsv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlane16_b32_vsv: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s7, s0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlane16_b32_vsv: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlane16_b32_vsv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v1, s7, s0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16_b32_vsv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s7, s0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -621,12 +609,12 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src ; ; GFX12-LABEL: v_permlanex16_b32_vii: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -667,13 +655,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src ; ; GFX12-LABEL: v_permlanex16_b32_vll: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-NEXT: s_movk_i32 s0, 0x1234 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: s_movk_i32 s2, 0x1234 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1 +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -739,35 +727,35 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s1, v0 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -781,76 +769,76 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src0, i32 %src2) { ; GFX10-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-SDAG-NEXT: s_mov_b32 null, 0 +; GFX10-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s6 +; GFX10-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-SDAG-NEXT: s_endpgm ; ; GFX10-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-GISEL-NEXT: s_mov_b32 null, 0 +; GFX10-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 +; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-GISEL-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vvs: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s7 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vvs: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s7 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -861,84 +849,72 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs(ptr addrspace(1) %out, i32 %src } define amdgpu_kernel void @v_permlanex16_b32_vsv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX10-SDAG-LABEL: v_permlanex16_b32_vsv: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-SDAG-NEXT: v_readfirstlane_b32 s2, v1 -; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SDAG-NEXT: v_permlanex16_b32 v0, v0, s3, s2 -; GFX10-SDAG-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_permlanex16_b32_vsv: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-GISEL-NEXT: v_readfirstlane_b32 s4, v1 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX10-LABEL: v_permlanex16_b32_vsv: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 null, 0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] +; GFX10-NEXT: s_endpgm ; ; GFX11-SDAG-LABEL: v_permlanex16_b32_vsv: ; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s7, s0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-SDAG-NEXT: s_nop 0 ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm ; ; GFX11-GISEL-LABEL: v_permlanex16_b32_vsv: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-GISEL-NEXT: s_nop 0 ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm ; ; GFX12-SDAG-LABEL: v_permlanex16_b32_vsv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6 +; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v1, s7, s0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16_b32_vsv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s6 +; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s7, s0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll index 9736782..77a975f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll @@ -10,26 +10,26 @@ declare i32 @llvm.amdgcn.workitem.id.y() define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -41,26 +41,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s6 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -72,26 +72,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vi(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s6 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -103,25 +103,25 @@ define amdgpu_kernel void @v_permlane16var_b32_vl(ptr addrspace(1) %out, i32 %sr define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v1, v0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -134,26 +134,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vvv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -165,26 +165,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -196,26 +196,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlane16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -227,26 +227,26 @@ define amdgpu_kernel void @v_permlane16var_b32_vv_fi_bc(ptr addrspace(1) %out, i define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -258,26 +258,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, s6 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -289,26 +289,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vi(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vl: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0xc1d1 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vl: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0xc1d1 :: v_dual_mov_b32 v1, s6 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -320,25 +320,25 @@ define amdgpu_kernel void @v_permlanex16var_b32_vl(ptr addrspace(1) %out, i32 %s define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 %src0) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vvv: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s6 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v1, v0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vvv: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s6 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v1, v0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -351,26 +351,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vvv(ptr addrspace(1) %out, i32 % define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -382,26 +382,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_fi(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm @@ -413,26 +413,26 @@ define amdgpu_kernel void @v_permlanex16var_b32_vv_bc(ptr addrspace(1) %out, i32 define amdgpu_kernel void @v_permlanex16var_b32_vv_fi_bc(ptr addrspace(1) %out, i32 %src0, i32 %src1) { ; GFX12-SDAG-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[4:5] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm ; ; GFX12-GISEL-LABEL: v_permlanex16var_b32_vv_fi_bc: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1] ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_nop 0 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 47c0217..bb3418c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -91,58 +91,58 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; ; VARIANT4-LABEL: test_barrier: ; VARIANT4: ; %bb.0: ; %entry -; VARIANT4-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT4-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; VARIANT4-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT4-NEXT: s_wait_kmcnt 0x0 -; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s2 -; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT4-NEXT: v_xad_u32 v1, v0, -1, s6 +; VARIANT4-NEXT: global_store_b32 v3, v0, s[4:5] ; VARIANT4-NEXT: s_wait_storecnt 0x0 ; VARIANT4-NEXT: s_barrier_signal -1 ; VARIANT4-NEXT: s_barrier_wait -1 ; VARIANT4-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; VARIANT4-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] -; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 +; VARIANT4-NEXT: v_add_co_u32 v1, vcc_lo, s4, v1 ; VARIANT4-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo +; VARIANT4-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s5, v2, vcc_lo ; VARIANT4-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT4-NEXT: s_wait_loadcnt 0x0 -; VARIANT4-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT4-NEXT: global_store_b32 v3, v0, s[4:5] ; VARIANT4-NEXT: s_nop 0 ; VARIANT4-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; VARIANT4-NEXT: s_endpgm ; ; VARIANT5-LABEL: test_barrier: ; VARIANT5: ; %bb.0: ; %entry -; VARIANT5-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT5-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; VARIANT5-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; VARIANT5-NEXT: s_wait_kmcnt 0x0 -; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s2 -; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT5-NEXT: v_xad_u32 v1, v0, -1, s6 +; VARIANT5-NEXT: global_store_b32 v3, v0, s[4:5] ; VARIANT5-NEXT: s_barrier_signal -1 ; VARIANT5-NEXT: s_barrier_wait -1 ; VARIANT5-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; VARIANT5-NEXT: v_lshlrev_b64_e32 v[1:2], 2, v[1:2] -; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s0, v1 +; VARIANT5-NEXT: v_add_co_u32 v1, vcc_lo, s4, v1 ; VARIANT5-NEXT: s_delay_alu instid0(VALU_DEP_2) -; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s1, v2, vcc_lo +; VARIANT5-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, s5, v2, vcc_lo ; VARIANT5-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT5-NEXT: s_wait_loadcnt 0x0 -; VARIANT5-NEXT: global_store_b32 v3, v0, s[0:1] +; VARIANT5-NEXT: global_store_b32 v3, v0, s[4:5] ; VARIANT5-NEXT: s_nop 0 ; VARIANT5-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; VARIANT5-NEXT: s_endpgm ; ; VARIANT6-LABEL: test_barrier: ; VARIANT6: ; %bb.0: ; %entry -; VARIANT6-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; VARIANT6-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; VARIANT6-NEXT: v_lshlrev_b32_e32 v5, 2, v0 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 -; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 -; VARIANT6-NEXT: v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v3, s0 -; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s2, v0 -; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] +; VARIANT6-NEXT: s_sub_co_i32 s0, s6, 1 +; VARIANT6-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_mov_b32 v4, s5 +; VARIANT6-NEXT: v_sub_nc_u32_e32 v1, s0, v0 +; VARIANT6-NEXT: global_store_b32 v5, v0, s[4:5] ; VARIANT6-NEXT: s_wait_storecnt 0x0 ; VARIANT6-NEXT: s_barrier_signal -1 ; VARIANT6-NEXT: s_barrier_wait -1 @@ -154,7 +154,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT6-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v4, v2, vcc_lo ; VARIANT6-NEXT: global_load_b32 v0, v[1:2], off ; VARIANT6-NEXT: s_wait_loadcnt 0x0 -; VARIANT6-NEXT: global_store_b32 v5, v0, s[0:1] +; VARIANT6-NEXT: global_store_b32 v5, v0, s[4:5] ; VARIANT6-NEXT: s_nop 0 ; VARIANT6-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; VARIANT6-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 38a34ec..3eb2261 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -518,33 +518,33 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test1_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_lshl_b32 s0, s6, 16 +; GCN-NEXT: global_store_b32 v3, v2, s[4:5] +; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_barrier_init -1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: global_store_b32 v3, v0, s[4:5] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test1_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s6 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[4:5] ; GLOBAL-ISEL-NEXT: s_barrier_init -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[4:5] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm @@ -562,33 +562,33 @@ entry: define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test2_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_lshl_b32 s0, s6, 16 +; GCN-NEXT: global_store_b32 v3, v2, s[4:5] +; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_barrier_init 1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: global_store_b32 v3, v0, s[4:5] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test2_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s6 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[4:5] ; GLOBAL-ISEL-NEXT: s_barrier_init 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[4:5] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm @@ -606,33 +606,33 @@ entry: define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { ; GCN-LABEL: test3_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_lshl_b32 s0, s6, 16 +; GCN-NEXT: global_store_b32 v3, v2, s[4:5] +; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_barrier_init 0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: global_store_b32 v3, v0, s[4:5] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test3_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s6 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[4:5] ; GLOBAL-ISEL-NEXT: s_barrier_init 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[4:5] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm @@ -650,35 +650,35 @@ entry: define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { ; GCN-LABEL: test4_s_barrier_init: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 ; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s3, s3, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_mov_b32 m0, s2 +; GCN-NEXT: s_lshl_b32 s0, s7, 16 +; GCN-NEXT: global_store_b32 v3, v2, s[4:5] +; GCN-NEXT: s_or_b32 s0, s6, s0 +; GCN-NEXT: s_mov_b32 m0, s0 ; GCN-NEXT: s_barrier_init m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: global_store_b32 v3, v0, s[4:5] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_init: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_or_b32 m0, s2, s3 +; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s7 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[4:5] +; GLOBAL-ISEL-NEXT: s_or_b32 m0, s6, s0 ; GLOBAL-ISEL-NEXT: s_barrier_init m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[4:5] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm @@ -852,33 +852,33 @@ entry: define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_barrier_join_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v3, v1, s[0:1] +; GCN-NEXT: s_mov_b32 m0, s6 +; GCN-NEXT: global_store_b32 v3, v1, s[4:5] ; GCN-NEXT: s_barrier_join m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: global_store_b32 v3, v0, s[4:5] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s6 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[4:5] ; GLOBAL-ISEL-NEXT: s_barrier_join m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[4:5] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm @@ -1098,33 +1098,33 @@ entry: define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_wakeup_barrier_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v3, v1, s[0:1] +; GCN-NEXT: s_mov_b32 m0, s6 +; GCN-NEXT: global_store_b32 v3, v1, s[4:5] ; GCN-NEXT: s_wakeup_barrier m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] +; GCN-NEXT: global_store_b32 v3, v0, s[4:5] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s6 +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[4:5] ; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[4:5] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm @@ -1278,32 +1278,32 @@ entry: define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { ; GCN-LABEL: test4_s_get_barrier_state_m0: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_get_barrier_state s2, m0 +; GCN-NEXT: s_mov_b32 m0, s6 +; GCN-NEXT: global_store_b32 v0, v1, s[4:5] +; GCN-NEXT: s_get_barrier_state s0, m0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: global_store_b32 v0, v1, s[4:5] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GCN-NEXT: s_endpgm ; ; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: ; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GLOBAL-ISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, m0 +; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s6 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[4:5] +; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 ; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 ; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0 +; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[4:5] ; GLOBAL-ISEL-NEXT: s_nop 0 ; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GLOBAL-ISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll index c2e74eb..673e9bf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx11.ll @@ -5,12 +5,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_add_nc_u32_e32 v32, s0, v40 -; GCN-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 +; GCN-NEXT: v_add_nc_u32_e32 v32, s2, v40 +; GCN-NEXT: v_dual_mov_b32 v81, s3 :: v_dual_add_nc_u32 v80, s3, v40 ; GCN-NEXT: ds_load_b128 v[4:7], v32 offset:16 ; GCN-NEXT: ds_load_b128 v[12:15], v32 offset:2064 ; GCN-NEXT: ds_load_b128 v[20:23], v32 offset:6160 @@ -72,12 +72,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v40, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s0, v40 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s1 :: v_dual_add_nc_u32 v80, s1, v40 +; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v32, s2, v40 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v81, s3 :: v_dual_add_nc_u32 v80, s3, v40 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v32 offset:16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v32 offset:2064 ; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v32 offset:6160 @@ -175,12 +175,12 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_add_nc_u32_e32 v17, s0, v16 -; GCN-NEXT: v_add_nc_u32_e32 v16, s1, v16 +; GCN-NEXT: v_add_nc_u32_e32 v17, s2, v16 +; GCN-NEXT: v_add_nc_u32_e32 v16, s3, v16 ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; GCN-NEXT: ds_load_b128 v[0:3], v17 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) @@ -196,7 +196,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; GCN-NEXT: ds_store_b128 v16, v[8:11] ; GCN-NEXT: ds_load_b128 v[4:7], v17 offset:2064 ; GCN-NEXT: ds_load_b128 v[0:3], v17 offset:2048 -; GCN-NEXT: v_mov_b32_e32 v16, s1 +; GCN-NEXT: v_mov_b32_e32 v16, s3 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(0) @@ -256,12 +256,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_WMMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 5, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1) -; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s0, v16 -; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s1, v16 +; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v17, s2, v16 +; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v16, s3, v16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:16 ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) @@ -277,7 +277,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_WMMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_store_b128 v16, v[8:11] ; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v17 offset:2064 ; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v17 offset:2048 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v16, s3 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll index fdcb177..bc29441 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll @@ -7,13 +7,13 @@ declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16( define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; GCN-NEXT: v_mov_b32_e32 v48, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28 -; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 +; GCN-NEXT: v_add_nc_u32_e32 v0, s2, v28 +; GCN-NEXT: v_dual_mov_b32 v50, s3 :: v_dual_add_nc_u32 v49, s3, v28 ; GCN-NEXT: ds_load_b128 v[8:11], v0 ; GCN-NEXT: ds_load_b128 v[12:15], v0 offset:512 ; GCN-NEXT: ds_load_b128 v[16:19], v0 offset:1536 @@ -58,13 +58,13 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 ; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2) -; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28 -; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28 +; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s2, v28 +; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s3 :: v_dual_add_nc_u32 v49, s3, v28 ; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0 ; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v0 offset:512 ; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v0 offset:1536 @@ -147,11 +147,11 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v18, 0 ; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0 -; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s2 +; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s3 ; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024 ; GCN-NEXT: ds_load_b128 v[1:4], v17 ; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16 @@ -165,7 +165,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved( ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: ds_store_b128 v0, v[13:16] ; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560 -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; GCN-NEXT: s_wait_dscnt 0x0 @@ -210,11 +210,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved( ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0 ; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0 -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0 -; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s2 +; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s3 ; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024 ; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17 ; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16 @@ -228,7 +228,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved( ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] ; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560 -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll index 10f09b6..ae5b62f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll @@ -29,18 +29,18 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -82,32 +82,32 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; GCN-NEXT: v_mul_lo_u32 v30, v30, v30 ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; GCN-NEXT: s_endpgm ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(7) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 @@ -149,14 +149,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v30, v30, v30 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(30) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: s_endpgm @@ -178,17 +178,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 +; GCN-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v29, v29, v29 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 -; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v28, v28, v28 ; GCN-NEXT: v_mul_lo_u32 v31, v31, v31 @@ -198,12 +198,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 -; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 +; GCN-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -218,11 +218,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 -; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; GCN-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 ; GCN-NEXT: v_mul_lo_u32 v14, v14, v14 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 -; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 +; GCN-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -242,14 +242,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v21, v21, v21 ; GCN-NEXT: v_mul_lo_u32 v20, v20, v20 ; GCN-NEXT: v_mul_lo_u32 v16, v16, v16 -; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 -; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; GCN-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 +; GCN-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 +; GCN-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 +; GCN-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 +; GCN-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -258,17 +258,17 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v32, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[28:31], v32, s[4:5] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v32, s[4:5] offset:96 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v29, v29, v29 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[0:1] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v32, s[4:5] ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v28, v28, v28 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v31, v31, v31 @@ -278,12 +278,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v32, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v32, s[4:5] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -298,11 +298,11 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[16:19], v32, s[4:5] offset:80 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v14, v14, v14 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:64 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[20:23], v32, s[4:5] offset:64 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[24:27], v32, s[4:5] offset:32 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -322,14 +322,14 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v21, v21, v21 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v20, v20, v20 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v16, v16, v16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[2:3] offset:96 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[2:3] offset:80 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[2:3] +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[4:7], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[8:11], s[6:7] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[16:19], s[6:7] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[20:23], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[12:15], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[24:27], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[28:31], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v32, v[0:3], s[6:7] ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -381,18 +381,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; GCN-NEXT: ; kill: killed $sgpr0_sgpr1 +; GCN-NEXT: ; kill: killed $sgpr4_sgpr5 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_mul_lo_u32 v13, v13, v13 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v7, v7, v7 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v12, v12, v12 ; GCN-NEXT: v_mul_lo_u32 v15, v15, v15 @@ -403,25 +403,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v3, v3, v3 ; GCN-NEXT: v_mul_lo_u32 v2, v2, v2 ; GCN-NEXT: v_mul_lo_u32 v1, v1, v1 ; GCN-NEXT: v_mul_lo_u32 v0, v0, v0 -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -430,9 +430,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v6, v6, v6 ; GCN-NEXT: v_mul_lo_u32 v5, v5, v5 ; GCN-NEXT: v_mul_lo_u32 v4, v4, v4 -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -452,15 +452,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mul_lo_u32 v11, v11, v11 ; GCN-NEXT: v_mul_lo_u32 v10, v10, v10 ; GCN-NEXT: v_mul_lo_u32 v9, v9, v9 ; GCN-NEXT: v_mul_lo_u32 v8, v8, v8 -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 ; GCN-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -469,18 +469,18 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_alternating_READ_VALU_WRITE: ; EXACTCUTOFF: ; %bb.0: -; EXACTCUTOFF-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v16, 7, v0 -; EXACTCUTOFF-NEXT: ; kill: killed $sgpr0_sgpr1 +; EXACTCUTOFF-NEXT: ; kill: killed $sgpr4_sgpr5 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:48 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(1) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v13, v13, v13 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v7, v7, v7 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v12, v12, v12 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v15, v15, v15 @@ -491,25 +491,25 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:112 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:112 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:112 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:112 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] offset:96 ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v3, v3, v3 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v2, v2, v2 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v1, v1, v1 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v0, v0, v0 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:96 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] offset:96 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:48 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:48 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:64 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -518,9 +518,9 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v6, v6, v6 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v5, v5, v5 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v4, v4, v4 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:64 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:32 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:16 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:64 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:32 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:16 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -540,15 +540,15 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16 -; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:16 +; EXACTCUTOFF-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:80 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt vmcnt(0) ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v11, v11, v11 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v10, v10, v10 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v9, v9, v9 ; EXACTCUTOFF-NEXT: v_mul_lo_u32 v8, v8, v8 -; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:80 +; EXACTCUTOFF-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:80 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000040) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000020) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000002) size(2) SyncID(0) @@ -614,10 +614,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_alternating_READ_VA define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v1, s0, v0 +; GCN-NEXT: v_add_u32_e32 v1, s2, v0 ; GCN-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -661,7 +661,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; GCN-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; GCN-NEXT: v_mov_b32_e32 v2, 2.0 -; GCN-NEXT: v_add_u32_e32 v0, s1, v0 +; GCN-NEXT: v_add_u32_e32 v0, s3, v0 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; GCN-NEXT: s_waitcnt lgkmcnt(14) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -681,7 +681,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; GCN-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; GCN-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; GCN-NEXT: ds_write_b128 v0, a[0:3] -; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 ; GCN-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; GCN-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; GCN-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -720,10 +720,10 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_cluster: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v0, 7, v0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s0, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s2, v0 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v1 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v1 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v1 offset:80 @@ -767,7 +767,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_read_b128 a[136:139], v2 offset:57376 ; EXACTCUTOFF-NEXT: ds_read_b128 a[140:143], v2 offset:57392 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 2.0 -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s1, v0 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s3, v0 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(40) SyncID(0) ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(14) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31] @@ -787,7 +787,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_cluster(ptr ad ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[8:11] offset:32 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[4:7] offset:16 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[0:3] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s3 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[56:59] offset:8288 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[60:63] offset:8304 ; EXACTCUTOFF-NEXT: ds_write_b128 v0, a[48:51] offset:8256 @@ -862,12 +862,12 @@ entry: define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 { ; GCN-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; GCN-NEXT: v_mov_b32_e32 v2, 1.0 ; GCN-NEXT: v_mov_b32_e32 v3, 2.0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_add_u32_e32 v0, s0, v1 +; GCN-NEXT: v_add_u32_e32 v0, s2, v1 ; GCN-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; GCN-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; GCN-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -878,7 +878,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_add_u32_e32 v1, s1, v1 +; GCN-NEXT: v_add_u32_e32 v1, s3, v1 ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; GCN-NEXT: s_nop 7 @@ -902,7 +902,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; GCN-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s3 ; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) @@ -995,12 +995,12 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; ; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_MFMA_interleave: ; EXACTCUTOFF: ; %bb.0: ; %entry -; EXACTCUTOFF-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; EXACTCUTOFF-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v1, 7, v0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v2, 1.0 ; EXACTCUTOFF-NEXT: v_mov_b32_e32 v3, 2.0 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) -; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s0, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v0, s2, v1 ; EXACTCUTOFF-NEXT: ds_read_b128 a[28:31], v0 offset:112 ; EXACTCUTOFF-NEXT: ds_read_b128 a[24:27], v0 offset:96 ; EXACTCUTOFF-NEXT: ds_read_b128 a[20:23], v0 offset:80 @@ -1011,7 +1011,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[12:15], v0 offset:48 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s1, v1 +; EXACTCUTOFF-NEXT: v_add_u32_e32 v1, s3, v1 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) ; EXACTCUTOFF-NEXT: s_nop 7 @@ -1035,7 +1035,7 @@ define amdgpu_kernel void @test_sched_group_barrier_pipeline_MFMA_interleave(ptr ; EXACTCUTOFF-NEXT: ds_read_b128 a[0:3], v0 offset:8192 ; EXACTCUTOFF-NEXT: s_waitcnt lgkmcnt(0) ; EXACTCUTOFF-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v2, v3, a[0:31] -; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s1 +; EXACTCUTOFF-NEXT: v_mov_b32_e32 v1, s3 ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(8) SyncID(0) ; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll index fc33206..8f8994e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll @@ -211,21 +211,21 @@ define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> %in) { ; GCN-LABEL: set_inactive_v2i32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp = call <2 x i32> @llvm.amdgcn.set.inactive.v2i32(<2 x i32> %in, <2 x i32> ) #0 store <2 x i32> %tmp, ptr addrspace(1) %out @@ -235,21 +235,21 @@ define amdgpu_kernel void @set_inactive_v2i32(ptr addrspace(1) %out, <2 x i32> % define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GCN-LABEL: set_inactive_v2f32: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 1.0 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp = call <2 x float> @llvm.amdgcn.set.inactive.v2f32(<2 x float> %in, <2 x float> ) #0 store <2 x float> %tmp, ptr addrspace(1) %out @@ -279,21 +279,21 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> %in) { ; GCN-LABEL: set_inactive_v4i16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x10001 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp = call <4 x i16> @llvm.amdgcn.set.inactive.v4i16(<4 x i16> %in, <4 x i16> ) #0 store <4 x i16> %tmp, ptr addrspace(1) %out @@ -303,21 +303,21 @@ define amdgpu_kernel void @set_inactive_v4i16(ptr addrspace(1) %out, <4 x i16> % define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> %in) { ; GCN-LABEL: set_inactive_v4f16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3c003c00 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp = call <4 x half> @llvm.amdgcn.set.inactive.v4f16(<4 x half> %in, <4 x half> ) #0 store <4 x half> %tmp, ptr addrspace(1) %out @@ -327,21 +327,21 @@ define amdgpu_kernel void @set_inactive_v4f16(ptr addrspace(1) %out, <4 x half> define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in) { ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_mov_b32 s8, 0x3f803f80 -; GCN-NEXT: s_mov_b32 s7, 0xf000 -; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s9, s8 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: s_mov_b32 s4, s0 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: s_mov_b32 s0, s4 +; GCN-NEXT: s_mov_b32 s1, s5 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_not_b64 exec, exec ; GCN-NEXT: v_mov_b32_e32 v0, s8 ; GCN-NEXT: v_mov_b32_e32 v1, s9 ; GCN-NEXT: s_not_b64 exec, exec -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 store <4 x bfloat> %tmp, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 0755dcd..87c5f5b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -18,15 +18,15 @@ define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_bfe_u32 v0, v0, s3, s3 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_bfe_u32 v0, v0, s7, s7 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_u32, ptr addrspace(1) %out, align 4 @@ -50,16 +50,16 @@ define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_arg_imm: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v1, 0x7b -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_bfe_u32 v0, s6, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_u32, ptr addrspace(1) %out, align 4 @@ -83,16 +83,16 @@ define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, ; ; VI-LABEL: bfe_u32_arg_imm_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7b -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_bfe_u32 v0, s2, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_bfe_u32 v0, s6, v0, v1 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_u32, ptr addrspace(1) %out, align 4 @@ -117,17 +117,17 @@ define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, ; ; VI-LABEL: bfe_u32_imm_arg_arg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_movk_i32 s8, 0x7b -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_bfe_u32 v0, s8, v0, v1 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_u32, ptr addrspace(1) %out, align 4 @@ -1625,16 +1625,16 @@ define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 ; ; VI-LABEL: v_lshr_and: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_lshr_b32 s0, s2, s3 -; VI-NEXT: s_and_b32 s0, s0, 7 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_lshr_b32 s4, s6, s7 +; VI-NEXT: s_and_b32 s4, s4, 7 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %c = lshr i32 %a, %b %d = and i32 %c, 7 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll index 3a86787..4f65acd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll @@ -336,33 +336,33 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 +; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x39a3b295 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4 -; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v6, s6, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 @@ -370,48 +370,48 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5 ; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v2 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s4, s2, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s4, v1 -; VI-GISEL-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s4 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v1 +; VI-GISEL-NEXT: s_and_b32 s0, s7, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3 -; VI-GISEL-NEXT: v_sub_f32_e32 v5, s3, v5 +; VI-GISEL-NEXT: v_sub_f32_e32 v5, s7, v5 ; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x39a3b295, v5 ; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3fb8a000, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v2, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v5 @@ -422,77 +422,77 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; VI-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc2ce8ed0 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x42b17218 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; VI-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -502,18 +502,18 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x42b17218 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll index a162949..ff20f90 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll @@ -338,33 +338,33 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) { define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; VI-SDAG-LABEL: s_exp10_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 -; VI-SDAG-NEXT: v_sub_f32_e32 v2, s3, v2 +; VI-SDAG-NEXT: s_and_b32 s0, s7, 0xfffff000 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: v_sub_f32_e32 v2, s7, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-SDAG-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x3a2784bc -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v0 -; VI-SDAG-NEXT: v_mul_f32_e32 v5, s4, v4 -; VI-SDAG-NEXT: s_and_b32 s4, s2, 0xfffff000 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s0, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v5, s0, v4 +; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000 ; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v1 -; VI-SDAG-NEXT: v_mov_b32_e32 v6, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v6, s0 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v1, v3 ; VI-SDAG-NEXT: v_add_f32_e32 v2, v5, v2 -; VI-SDAG-NEXT: v_sub_f32_e32 v6, s2, v6 +; VI-SDAG-NEXT: v_sub_f32_e32 v6, s6, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2 -; VI-SDAG-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v6 ; VI-SDAG-NEXT: v_mul_f32_e32 v6, 0x40549000, v6 ; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1 ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3 ; VI-SDAG-NEXT: v_rndne_f32_e32 v5, v0 ; VI-SDAG-NEXT: v_add_f32_e32 v6, v6, v7 -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s4, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v5 ; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v6 ; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4 @@ -372,48 +372,48 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v4, v5 ; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xc23369f4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v2 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v2 ; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0x7f800000 -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v4 -; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v2 +; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v2 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; ; VI-GISEL-LABEL: s_exp10_v2f32: ; VI-GISEL: ; %bb.0: -; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; VI-GISEL-NEXT: s_and_b32 s4, s2, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 -; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2 +; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_sub_f32_e32 v2, s6, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v2 ; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v3, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v4 -; VI-GISEL-NEXT: v_mul_f32_e32 v4, s4, v1 -; VI-GISEL-NEXT: s_and_b32 s4, s3, 0xfffff000 -; VI-GISEL-NEXT: v_mov_b32_e32 v5, s4 +; VI-GISEL-NEXT: v_mul_f32_e32 v4, s0, v1 +; VI-GISEL-NEXT: s_and_b32 s0, s7, 0xfffff000 +; VI-GISEL-NEXT: v_mov_b32_e32 v5, s0 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2 ; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3 -; VI-GISEL-NEXT: v_sub_f32_e32 v5, s3, v5 +; VI-GISEL-NEXT: v_sub_f32_e32 v5, s7, v5 ; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4 ; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3a2784bc, v5 ; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x40549000, v5 ; VI-GISEL-NEXT: v_add_f32_e32 v2, v3, v2 -; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0 +; VI-GISEL-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v6 -; VI-GISEL-NEXT: v_mul_f32_e32 v1, s4, v1 +; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1 ; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; VI-GISEL-NEXT: v_exp_f32_e32 v2, v2 ; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v5 @@ -424,77 +424,77 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; VI-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; VI-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0xc23369f4 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3 ; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x421a209b ; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; VI-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v4 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; VI-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v3 +; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v3 ; VI-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v4 -; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v4 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4 ; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc -; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5 ; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-GISEL-NEXT: s_endpgm ; ; GFX900-SDAG-LABEL: s_exp10_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s3, v0 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v0, -v2 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v0, -v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_fma_f32 v4, s3, v1, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s2, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v4, s7, v1, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s6, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v6 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v0, -v6 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v8, v6, v7 -; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0 +; GFX900-SDAG-NEXT: v_fma_f32 v0, s6, v1, v0 ; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v8, v0 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0 ; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v6, v7 ; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s3, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; GFX900-SDAG-NEXT: v_mov_b32_e32 v7, 0x7f800000 -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s3, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v3 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v6 -; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v5 +; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc -; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v3 +; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v3 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp10_v2f32: ; GFX900-GISEL: ; %bb.0: -; GFX900-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v6, 0x7f800000 ; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v0, -v2 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s6, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v0, -v2 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v4, v2 -; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s3, v0 -; GFX900-GISEL-NEXT: v_fma_f32 v3, s2, v1, v3 +; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s7, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v3, s6, v1, v3 ; GFX900-GISEL-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v0, -v5 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v0, -v5 ; GFX900-GISEL-NEXT: v_add_f32_e32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_fma_f32 v0, s3, v1, v0 +; GFX900-GISEL-NEXT: v_fma_f32 v0, s7, v1, v0 ; GFX900-GISEL-NEXT: v_rndne_f32_e32 v1, v5 ; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4 ; GFX900-GISEL-NEXT: v_exp_f32_e32 v2, v2 @@ -504,18 +504,18 @@ define amdgpu_kernel void @s_exp10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX900-GISEL-NEXT: v_exp_f32_e32 v5, v0 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4 ; GFX900-GISEL-NEXT: v_ldexp_f32 v2, v2, v3 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v4 ; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x421a209b ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc ; GFX900-GISEL-NEXT: v_ldexp_f32 v1, v5, v1 -; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v4 +; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4 ; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc -; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s3, v3 +; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v3 ; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX900-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX900-GISEL-NEXT: s_endpgm ; ; SI-SDAG-LABEL: s_exp10_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index 36e7897..06fa910 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -222,25 +222,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_exp2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; VI-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 +; VI-SDAG-NEXT: v_add_f32_e32 v1, s6, v1 ; VI-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; VI-SDAG-NEXT: v_exp_f32_e32 v2, v1 ; VI-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 ; VI-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; @@ -270,25 +270,25 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_exp2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x42800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc -; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s3, v4 -; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_add_f32_e32 v4, s7, v4 +; GFX900-SDAG-NEXT: v_add_f32_e32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_mul_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_exp2_v2f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index d847af7..ad70589 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -568,15 +568,15 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; ; GFX1100-SDAG-LABEL: s_log_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 @@ -589,29 +589,29 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s1 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| ; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5 -; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 @@ -624,14 +624,14 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) { ; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x41b17218, s1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| ; GFX1100-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 3f060de..82c73fa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -568,15 +568,15 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log10_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s3, v0 :: v_dual_mul_f32 v1, s2, v1 +; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 @@ -589,29 +589,29 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-SDAG-NEXT: v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1100-SDAG-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s1 ; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| ; GFX1100-SDAG-NEXT: v_dual_cndmask_b32 v2, v1, v3 :: v_dual_mov_b32 v3, 0 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v0, v4 :: v_dual_sub_f32 v0, v2, v5 -; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b64 v3, v[0:1], s[4:5] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log10_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 @@ -624,14 +624,14 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; GFX1100-GISEL-NEXT: v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1100-GISEL-NEXT: v_dual_add_f32 v2, v2, v4 :: v_dual_add_f32 v3, v3, v5 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x411a209b, s1 ; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v1| ; GFX1100-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, v1, v3 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index 035b243..b76e621 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -265,25 +265,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; VI-SDAG-LABEL: s_log2_v2f32: ; VI-SDAG: ; %bb.0: -; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; VI-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; VI-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; VI-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1 ; VI-SDAG-NEXT: v_log_f32_e32 v4, v4 ; VI-SDAG-NEXT: v_log_f32_e32 v2, v1 ; VI-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 ; VI-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1 -; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, s4 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5 ; VI-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-SDAG-NEXT: s_endpgm ; @@ -313,25 +313,25 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX900-SDAG-LABEL: s_log2_v2f32: ; GFX900-SDAG: ; %bb.0: -; GFX900-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0x4f800000 ; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s7, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v4, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GFX900-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0 ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc ; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc -; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s3, v4 -; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v4, s7, v4 +; GFX900-SDAG-NEXT: v_mul_f32_e32 v1, s6, v1 ; GFX900-SDAG-NEXT: v_log_f32_e32 v4, v4 ; GFX900-SDAG-NEXT: v_log_f32_e32 v2, v1 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v1, v4, v3 ; GFX900-SDAG-NEXT: v_sub_f32_e32 v0, v2, v0 -; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] +; GFX900-SDAG-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] ; GFX900-SDAG-NEXT: s_endpgm ; ; GFX900-GISEL-LABEL: s_log2_v2f32: @@ -359,49 +359,49 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in) ; ; GFX1100-SDAG-LABEL: s_log2_v2f32: ; GFX1100-SDAG: ; %bb.0: -; GFX1100-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1100-SDAG-NEXT: v_mov_b32_e32 v4, 0 ; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s3 -; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s2 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s7 +; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s6 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s5 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s4 -; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s5 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s1 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s0 +; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s1 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s3, v1 -; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s7, v1 +; GFX1100-SDAG-NEXT: v_mul_f32_e32 v3, s6, v3 ; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3 ; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v1, v0 :: v_dual_sub_f32 v0, v3, v2 -; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[0:1] +; GFX1100-SDAG-NEXT: global_store_b64 v4, v[0:1], s[4:5] ; GFX1100-SDAG-NEXT: s_nop 0 ; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-SDAG-NEXT: s_endpgm ; ; GFX1100-GISEL-LABEL: s_log2_v2f32: ; GFX1100-GISEL: ; %bb.0: -; GFX1100-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2 -; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s5, 0x800000, s3 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s0, 0x800000, s6 +; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s1, 0x800000, s7 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s5 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s4 -; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s5 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s1 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x42000000, s0 +; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s1 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_mul_f32 v1, s3, v1 +; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s7, v1 ; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0 ; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1 ; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff ; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_sub_f32 v1, v1, v3 ; GFX1100-GISEL-NEXT: v_mov_b32_e32 v2, 0 -; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1100-GISEL-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX1100-GISEL-NEXT: s_nop 0 ; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index 826862e..9fcbdf3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -224,11 +224,11 @@ define amdgpu_kernel void @local_size_yz(ptr addrspace(1) %out) { ; ; VI-LABEL: local_size_yz: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x1c +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x1c ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_mul_i32 s0, s4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 7ad7cc8..c5d2f79 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -135,54 +135,54 @@ define amdgpu_kernel void @round_v2f32(ptr addrspace(1) %out, <2 x float> %in) # ; ; GFX89-LABEL: round_v2f32: ; GFX89: ; %bb.0: -; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX89-NEXT: s_brev_b32 s8, -2 -; GFX89-NEXT: s_mov_b32 s7, 0xf000 -; GFX89-NEXT: s_mov_b32 s6, -1 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: v_trunc_f32_e32 v0, s3 -; GFX89-NEXT: v_sub_f32_e32 v1, s3, v0 -; GFX89-NEXT: s_mov_b32 s4, s0 -; GFX89-NEXT: s_mov_b32 s5, s1 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v2, s3 +; GFX89-NEXT: v_trunc_f32_e32 v0, s7 +; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5] +; GFX89-NEXT: v_mov_b32_e32 v2, s7 ; GFX89-NEXT: v_bfi_b32 v1, s8, v1, v2 ; GFX89-NEXT: v_add_f32_e32 v1, v0, v1 -; GFX89-NEXT: v_trunc_f32_e32 v0, s2 -; GFX89-NEXT: v_sub_f32_e32 v2, s2, v0 -; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5 -; GFX89-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[0:1] -; GFX89-NEXT: v_mov_b32_e32 v3, s2 +; GFX89-NEXT: v_trunc_f32_e32 v0, s6 +; GFX89-NEXT: v_sub_f32_e32 v2, s6, v0 +; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5 +; GFX89-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s[4:5] +; GFX89-NEXT: v_mov_b32_e32 v3, s6 ; GFX89-NEXT: v_bfi_b32 v2, s8, v2, v3 ; GFX89-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; ; GFX11-LABEL: round_v2f32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_trunc_f32_e32 v0, s3 -; GFX11-NEXT: v_trunc_f32_e32 v2, s2 +; GFX11-NEXT: v_trunc_f32_e32 v0, s7 +; GFX11-NEXT: v_trunc_f32_e32 v2, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_sub_f32_e32 v1, s3, v0 -; GFX11-NEXT: v_sub_f32_e32 v3, s2, v2 +; GFX11-NEXT: v_sub_f32_e32 v1, s7, v0 +; GFX11-NEXT: v_sub_f32_e32 v3, s6, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v1|, 0.5 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s4 +; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v1|, 0.5 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cmp_ge_f32_e64 s4, |v3|, 0.5 -; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s3 +; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v3|, 0.5 +; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s4 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0 +; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: v_add_f32_e32 v1, v0, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, s6 +; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: v_add_f32_e32 v0, v2, v3 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 +; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index 994ef22..6707132 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -8,28 +8,28 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_lshr_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_pk_lshrrev_b16 v1, s3, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s7, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_lshr_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s4, s2, 0xffff -; VI-NEXT: s_lshr_b32 s2, s2, 16 -; VI-NEXT: s_lshr_b32 s5, s3, 16 -; VI-NEXT: s_lshr_b32 s2, s2, s5 -; VI-NEXT: s_lshr_b32 s3, s4, s3 -; VI-NEXT: s_lshl_b32 s2, s2, 16 -; VI-NEXT: s_or_b32 s2, s3, s2 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_lshr_b32 s1, s6, 16 +; VI-NEXT: s_lshr_b32 s2, s7, 16 +; VI-NEXT: s_and_b32 s0, s6, 0xffff +; VI-NEXT: s_lshr_b32 s1, s1, s2 +; VI-NEXT: s_lshr_b32 s0, s0, s7 +; VI-NEXT: s_lshl_b32 s1, s1, 16 +; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -54,20 +54,20 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, < ; ; GFX10-LABEL: s_lshr_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshrrev_b16 v1, s3, s2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_pk_lshrrev_b16 v1, s7, s6 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_lshr_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_lshrrev_b16 v1, s3, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_pk_lshrrev_b16 v1, s7, s6 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 9ec37a5..def0dfa 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -190,11 +190,11 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX8-LABEL: madak_2_use_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v6 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc @@ -206,8 +206,8 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_load_dword v4, v[4:5] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v6 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v6 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x41200000 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: v_madak_f32 v6, v7, v8, 0x41200000 @@ -220,61 +220,61 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX9-LABEL: madak_2_use_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX9-NEXT: v_mac_f32_e32 v4, v1, v3 -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v4, s[2:3] offset:4 +; GFX9-NEXT: global_store_dword v0, v4, s[6:7] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_2_use_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-MAD-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-MAD-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v2, v1, v2, 0x41200000 ; GFX10-MAD-NEXT: v_madak_f32 v1, v1, v3, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v2, s[4:5] ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[2:3] offset:4 +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[6:7] offset:4 ; GFX10-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_2_use_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[6:7] offset:4 glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-MAD-NEXT: global_load_b32 v3, v0, s[6:7] offset:8 glc dlc ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v3 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_dual_add_f32 v1, 0x41200000, v1 :: v_dual_add_f32 v2, 0x41200000, v2 -; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-MAD-NEXT: global_store_b32 v0, v2, s[4:5] dlc ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc +; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[6:7] offset:4 dlc ; GFX11-MAD-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-MAD-NEXT: s_nop 0 ; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -282,59 +282,59 @@ define amdgpu_kernel void @madak_2_use_f32(ptr addrspace(1) noalias %out, ptr ad ; ; GFX940-FMA-LABEL: madak_2_use_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: v_mov_b32_e32 v4, 0x41200000 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 sc0 sc1 +; GFX940-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX940-FMA-NEXT: v_fmac_f32_e32 v4, v1, v3 -; GFX940-FMA-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX940-FMA-NEXT: global_store_dword v0, v4, s[2:3] offset:4 sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v4, s[6:7] offset:4 sc0 sc1 ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_2_use_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc +; GFX10-FMA-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v3, v0, s[2:3] offset:8 glc dlc +; GFX10-FMA-NEXT: global_load_dword v3, v0, s[6:7] offset:8 glc dlc ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX10-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v2, s[4:5] ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[2:3] offset:4 +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[6:7] offset:4 ; GFX10-FMA-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_2_use_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc +; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7] offset:4 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3] offset:8 glc dlc +; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[6:7] offset:8 glc dlc ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmaak_f32 v2, v1, v2, 0x41200000 ; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v3, 0x41200000 -; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5] dlc ; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[2:3] offset:4 dlc +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[6:7] offset:4 dlc ; GFX11-FMA-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -381,15 +381,15 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX8-LABEL: madak_m_inline_imm_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v3, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_madak_f32 v2, 4.0, v3, 0x41200000 @@ -398,72 +398,72 @@ define amdgpu_kernel void @madak_m_inline_imm_f32(ptr addrspace(1) noalias %out, ; ; GFX9-LABEL: madak_m_inline_imm_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: madak_m_inline_imm_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-MAD-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX10-MAD-NEXT: v_madak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-MAD-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: madak_m_inline_imm_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) ; GFX11-MAD-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1 -; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-MAD-NEXT: s_nop 0 ; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-MAD-NEXT: s_endpgm ; ; GFX940-FMA-LABEL: madak_m_inline_imm_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX940-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX940-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX940-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: madak_m_inline_imm_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-FMA-NEXT: global_load_dword v1, v0, s[6:7] ; GFX10-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX10-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-FMA-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: madak_m_inline_imm_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] ; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) ; GFX11-FMA-NEXT: v_fmaak_f32 v1, 4.0, v1, 0x41200000 -; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm @@ -919,78 +919,78 @@ define amdgpu_kernel void @s_s_madak_f32(ptr addrspace(1) %out, float %a, float ; ; GFX8-LABEL: s_s_madak_f32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mac_f32_e32 v2, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_mac_f32_e32 v2, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_s_madak_f32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_mac_f32_e32 v1, s2, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mac_f32_e32 v1, s6, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-MAD-LABEL: s_s_madak_f32: ; GFX10-MAD: ; %bb.0: -; GFX10-MAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-MAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-MAD-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-MAD-NEXT: v_madak_f32 v0, s2, v0, 0x41200000 -; GFX10-MAD-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-MAD-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-MAD-NEXT: v_madak_f32 v0, s6, v0, 0x41200000 +; GFX10-MAD-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-MAD-NEXT: s_endpgm ; ; GFX11-MAD-LABEL: s_s_madak_f32: ; GFX11-MAD: ; %bb.0: -; GFX11-MAD-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s2, s3 +; GFX11-MAD-NEXT: v_mul_f32_e64 v0, s6, s7 ; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-MAD-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, 0x41200000, v0 -; GFX11-MAD-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-MAD-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-MAD-NEXT: s_nop 0 ; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-MAD-NEXT: s_endpgm ; ; GFX940-FMA-LABEL: s_s_madak_f32: ; GFX940-FMA: ; %bb.0: -; GFX940-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX940-FMA-NEXT: v_mov_b32_e32 v1, 0x41200000 ; GFX940-FMA-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s3 -; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s2, v2 -; GFX940-FMA-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1 +; GFX940-FMA-NEXT: v_mov_b32_e32 v2, s7 +; GFX940-FMA-NEXT: v_fmac_f32_e32 v1, s6, v2 +; GFX940-FMA-NEXT: global_store_dword v0, v1, s[4:5] sc0 sc1 ; GFX940-FMA-NEXT: s_endpgm ; ; GFX10-FMA-LABEL: s_s_madak_f32: ; GFX10-FMA: ; %bb.0: -; GFX10-FMA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-FMA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-FMA-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s3 -; GFX10-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000 -; GFX10-FMA-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-FMA-NEXT: v_mov_b32_e32 v0, s7 +; GFX10-FMA-NEXT: v_fmaak_f32 v0, s6, v0, 0x41200000 +; GFX10-FMA-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-FMA-NEXT: s_endpgm ; ; GFX11-FMA-LABEL: s_s_madak_f32: ; GFX11-FMA: ; %bb.0: -; GFX11-FMA-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX11-FMA-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s7 ; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FMA-NEXT: v_fmaak_f32 v0, s2, v0, 0x41200000 -; GFX11-FMA-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-FMA-NEXT: v_fmaak_f32 v0, s6, v0, 0x41200000 +; GFX11-FMA-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX11-FMA-NEXT: s_nop 0 ; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-FMA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 940287d..5c88328 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -5,41 +5,41 @@ define amdgpu_kernel void @vector_clause(ptr addrspace(1) noalias nocapture readonly %arg, ptr addrspace(1) noalias nocapture %arg1) { ; GCN-LABEL: vector_clause: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GCN-NEXT: s_endpgm ; ; GCN-SCRATCH-LABEL: vector_clause: ; GCN-SCRATCH: ; %bb.0: ; %bb -; GCN-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GCN-SCRATCH-NEXT: s_clause 0x3 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[0:1] -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 -; GCN-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v16, s[4:5] +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[4:5] offset:16 +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[4:5] offset:32 +; GCN-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[4:5] offset:48 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(3) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(2) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:16 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:32 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3] offset:48 +; GCN-SCRATCH-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 ; GCN-SCRATCH-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir index b08da2e..a87503c 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-s-load.mir +++ b/llvm/test/CodeGen/AMDGPU/merge-s-load.mir @@ -8,9 +8,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x1_x1 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; CHECK-NEXT: early-clobber %3:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %3.sub0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub1 %0:sgpr_64 = IMPLICIT_DEF %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s32)) @@ -48,16 +48,16 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_LOAD_DWORDX2_IMM]].sub0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1 + ; GFX11-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32_xm0_xexec = COPY %4.sub0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %4.sub1 ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 + ; GFX12-NEXT: early-clobber %5:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %5.sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %5.sub2 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 %0:sgpr_64 = IMPLICIT_DEF @@ -72,9 +72,9 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; GFX11-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_64_xexec = COPY %7.sub0_sub1 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_64_xexec = COPY killed %7.sub2_sub3 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY]].sub0 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub1 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY1]].sub0 @@ -82,9 +82,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 + ; GFX12-NEXT: early-clobber %7:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %7.sub0_sub1_sub2 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %7.sub3 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub2 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 @@ -102,9 +102,9 @@ body: | bb.0: ; GFX11-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX11-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX11-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_64_xexec = COPY [[COPY]].sub0_sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY killed [[COPY]].sub2_sub3 ; GFX11-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[COPY2]].sub0 @@ -120,9 +120,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x1_x1_x1_x1_x1_x1_x1_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; GFX12-NEXT: early-clobber %15:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 4) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %15.sub0_sub1_sub2_sub3 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %15.sub4_sub5_sub6_sub7 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_96 = COPY [[COPY]].sub0_sub1_sub2 ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[COPY]].sub3 ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]].sub0_sub1 @@ -157,9 +157,9 @@ body: | ; ; GFX12-LABEL: name: merge_s_load_x2_x1 ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; GFX12-NEXT: [[S_LOAD_DWORDX3_IMM:%[0-9]+]]:sgpr_96 = S_LOAD_DWORDX3_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) - ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX3_IMM]].sub0_sub1 - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX3_IMM]].sub2 + ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) @@ -171,9 +171,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x2_x2 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed [[S_LOAD_DWORDX4_IMM]].sub2_sub3 + ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) %2:sgpr_64 = S_LOAD_DWORDX2_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) @@ -185,9 +185,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x2_x2_x2_x2 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 @@ -205,9 +205,9 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x3_x1 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1_sub2 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed [[S_LOAD_DWORDX4_IMM]].sub3 + ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_96 = S_LOAD_DWORDX3_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96)) %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32)) @@ -219,10 +219,118 @@ body: | bb.0: ; CHECK-LABEL: name: merge_s_load_x4_x4 ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF - ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX8_IMM]].sub0_sub1_sub2_sub3 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed [[S_LOAD_DWORDX8_IMM]].sub4_sub5_sub6_sub7 + ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 %0:sgpr_64 = IMPLICIT_DEF %1:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128)) %2:sgpr_128 = S_LOAD_DWORDX4_IMM %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128)) ... + +# The constrained multi-dword scalar load merge tests. +--- +name: merge_s_load_x1_x2ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x1_x2ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s64)) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x1_x3ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x1_x3ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 4, 0 :: (dereferenceable invariant load (s96), align 16) + %0:sgpr_64 = IMPLICIT_DEF + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s32)) + early-clobber %2:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 4, 0 :: (dereferenceable invariant load (s96)) +... + +--- +name: merge_s_load_x2ec_x1 +body: | + bb.0: + ; GFX11-LABEL: name: merge_s_load_x2ec_x1 + ; GFX11: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX11-NEXT: early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s64)) + ; GFX11-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[DEF]], 8, 0 :: (dereferenceable invariant load (s32)) + ; + ; GFX12-LABEL: name: merge_s_load_x2ec_x1 + ; GFX12: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; GFX12-NEXT: early-clobber %3:sgpr_96 = S_LOAD_DWORDX3_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s96), align 8) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub2 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s32)) +... + +--- +name: merge_s_load_x2ec_x2ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY %3.sub0_sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_64 = COPY killed %3.sub2_sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x2ec_x2ec_x2ec_x2ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x2ec_x2ec_x2ec_x2ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %7:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 8) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %7.sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %7.sub4_sub5_sub6_sub7 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_64 = COPY [[COPY]].sub0_sub1 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_64 = COPY killed [[COPY]].sub2_sub3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_64 = COPY [[COPY1]].sub0_sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY killed [[COPY1]].sub2_sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s64)) + early-clobber %2:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 8, 0 :: (dereferenceable invariant load (s64)) + early-clobber %3:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s64)) + early-clobber %4:sgpr_64 = S_LOAD_DWORDX2_IMM_ec %0:sgpr_64, 24, 0 :: (dereferenceable invariant load (s64)) +... + +--- +name: merge_s_load_x3ec_x1 +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x3ec_x1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s128)) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_96 = COPY %3.sub0_sub1_sub2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY killed %3.sub3 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_96 = S_LOAD_DWORDX3_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s96)) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0:sgpr_64, 12, 0 :: (dereferenceable invariant load (s32)) +... + +--- +name: merge_s_load_x4ec_x4ec +body: | + bb.0: + ; CHECK-LABEL: name: merge_s_load_x4ec_x4ec + ; CHECK: [[DEF:%[0-9]+]]:sgpr_64 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[DEF]], 0, 0 :: (dereferenceable invariant load (s256), align 16) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %3.sub0_sub1_sub2_sub3 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_128 = COPY killed %3.sub4_sub5_sub6_sub7 + %0:sgpr_64 = IMPLICIT_DEF + early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 0, 0 :: (dereferenceable invariant load (s128)) + early-clobber %2:sgpr_128 = S_LOAD_DWORDX4_IMM_ec %0:sgpr_64, 16, 0 :: (dereferenceable invariant load (s128)) +... diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 9dafa27..2334543 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -185,13 +185,13 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_sle_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -257,13 +257,13 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_imin_sle_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -808,11 +808,11 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16 ; ; GFX11-LABEL: s_test_imin_sle_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_min_i16 v1, s2, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_pk_min_i16 v1, s6, s7 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -1324,13 +1324,13 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_imin_slt_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_i32 s2, s2, s3 +; GFX11-NEXT: s_min_i32 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2084,13 +2084,13 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ule_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_u32 s2, s2, s3 +; GFX11-NEXT: s_min_u32 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2405,13 +2405,13 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3 ; ; GFX11-LABEL: s_test_umin_ult_i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_u32 s2, s2, s3 +; GFX11-NEXT: s_min_u32 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -2763,13 +2763,13 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32 ; ; GFX11-LABEL: s_test_umin_ult_v1i32: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_min_u32 s2, s2, s3 +; GFX11-NEXT: s_min_u32 s0, s6, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll index 9d6e092..c98cfa0 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-ctlz-cttz.ll @@ -8,17 +8,17 @@ declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -40,23 +40,23 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -76,7 +76,7 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone @@ -87,17 +87,17 @@ define amdgpu_kernel void @ctlz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: ctlz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -120,23 +120,23 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v2, v2, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: ctlz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] offset:2 -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:4 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] offset:2 +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:4 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(5) @@ -157,7 +157,7 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v2, v0 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 false) nounwind readnone @@ -168,17 +168,17 @@ define amdgpu_kernel void @ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64_poison: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -200,23 +200,23 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX9-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64_poison: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -238,7 +238,7 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad ; GFX10-NEXT: v_ffbl_b32_e32 v2, v2 ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone @@ -249,17 +249,17 @@ define amdgpu_kernel void @cttz_i64_poison(ptr addrspace(1) noalias %out, ptr ad define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind { ; GFX9-LABEL: cttz_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX9-NEXT: global_load_ubyte v2, v1, s[2:3] offset:6 -; GFX9-NEXT: global_load_ubyte v3, v1, s[2:3] offset:7 -; GFX9-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX9-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX9-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX9-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX9-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX9-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX9-NEXT: global_load_ubyte v2, v1, s[6:7] offset:6 +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] offset:7 +; GFX9-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX9-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX9-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX9-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX9-NEXT: s_waitcnt vmcnt(5) @@ -282,23 +282,23 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX9-NEXT: v_add_u32_e64 v0, v0, 32 clamp ; GFX9-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX9-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: cttz_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_ubyte v0, v1, s[2:3] offset:5 -; GFX10-NEXT: global_load_ubyte v2, v1, s[2:3] offset:7 -; GFX10-NEXT: global_load_ubyte v3, v1, s[2:3] offset:6 -; GFX10-NEXT: global_load_ubyte v4, v1, s[2:3] offset:1 -; GFX10-NEXT: global_load_ubyte v5, v1, s[2:3] offset:3 -; GFX10-NEXT: global_load_ubyte v6, v1, s[2:3] offset:4 -; GFX10-NEXT: global_load_ubyte v7, v1, s[2:3] -; GFX10-NEXT: global_load_ubyte v8, v1, s[2:3] offset:2 +; GFX10-NEXT: global_load_ubyte v0, v1, s[6:7] offset:5 +; GFX10-NEXT: global_load_ubyte v2, v1, s[6:7] offset:7 +; GFX10-NEXT: global_load_ubyte v3, v1, s[6:7] offset:6 +; GFX10-NEXT: global_load_ubyte v4, v1, s[6:7] offset:1 +; GFX10-NEXT: global_load_ubyte v5, v1, s[6:7] offset:3 +; GFX10-NEXT: global_load_ubyte v6, v1, s[6:7] offset:4 +; GFX10-NEXT: global_load_ubyte v7, v1, s[6:7] +; GFX10-NEXT: global_load_ubyte v8, v1, s[6:7] offset:2 ; GFX10-NEXT: s_waitcnt vmcnt(7) ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX10-NEXT: s_waitcnt vmcnt(6) @@ -321,7 +321,7 @@ define amdgpu_kernel void @cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace ; GFX10-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp ; GFX10-NEXT: v_min_u32_e32 v0, v0, v2 ; GFX10-NEXT: v_min_u32_e32 v0, 64, v0 -; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX10-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm %val = load i64, ptr addrspace(1) %arrayidx, align 1 %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 false) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index b427204..3c60153 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -676,14 +676,17 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_sext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_ashr_i32 s3, s2, 31 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_ashr_i32 s1, s6, 31 +; GFX12-NEXT: s_mov_b32 s0, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], 0x50 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_mov_b32 s0, s4 +; GFX12-NEXT: s_mov_b32 s1, s5 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -784,13 +787,17 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) { ; ; GFX12-LABEL: mul64_zext_c: ; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 -; GFX12-NEXT: s_mov_b32 s3, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mul_u64 s[4:5], s[2:3], 0x50 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s1, 0 ; GFX12-NEXT: s_mov_b32 s3, 0x31016000 -; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-NEXT: s_mov_b32 s2, -1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s0, s6 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], 0x50 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_mov_b32 s0, s4 +; GFX12-NEXT: s_mov_b32 s1, s5 ; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 357b851..6d7bf002 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -24,32 +24,32 @@ define amdgpu_kernel void @test_smul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_smul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_bfe_i32 s0, s2, 0x180000 -; VI-NEXT: s_bfe_i32 s1, s3, 0x180000 -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_bfe_i32 s4, s6, 0x180000 +; VI-NEXT: s_bfe_i32 s5, s7, 0x180000 +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smul24_i32: @@ -113,30 +113,30 @@ define amdgpu_kernel void @test_smulhi24_i64(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_smulhi24_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s2, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s6, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_smulhi24_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_bfe_i32 s0, s2, 0x180000 -; GFX9-NEXT: s_bfe_i32 s1, s3, 0x180000 -; GFX9-NEXT: s_mul_hi_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_bfe_i32 s4, s6, 0x180000 +; GFX9-NEXT: s_bfe_i32 s5, s7, 0x180000 +; GFX9-NEXT: s_mul_hi_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; EG-LABEL: test_smulhi24_i64: diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 3a16c88..e6470a5 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -24,32 +24,32 @@ define amdgpu_kernel void @test_umul24_i32(ptr addrspace(1) %out, i32 %a, i32 %b ; ; VI-LABEL: test_umul24_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_and_b32 s0, s2, 0xffffff -; VI-NEXT: s_and_b32 s1, s3, 0xffffff -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_and_b32 s4, s6, 0xffffff +; VI-NEXT: s_and_b32 s5, s7, 0xffffff +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umul24_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_i32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_i32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %0 = shl i32 %a, 8 @@ -392,30 +392,30 @@ define amdgpu_kernel void @test_umulhi24_i32_i64(ptr addrspace(1) %out, i32 %a, ; ; VI-LABEL: test_umulhi24_i32_i64: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s2, v0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: v_mov_b32_e32 v0, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s6, v0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi24_i32_i64: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: s_and_b32 s0, s2, 0xffffff -; GFX9-NEXT: s_and_b32 s1, s3, 0xffffff -; GFX9-NEXT: s_mul_hi_u32 s0, s0, s1 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: s_and_b32 s4, s6, 0xffffff +; GFX9-NEXT: s_and_b32 s5, s7, 0xffffff +; GFX9-NEXT: s_mul_hi_u32 s4, s4, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm entry: %a.24 = and i32 %a, 16777215 @@ -647,30 +647,30 @@ define amdgpu_kernel void @test_umulhi16_i32(ptr addrspace(1) %out, i32 %a, i32 ; ; VI-LABEL: test_umulhi16_i32: ; VI: ; %bb.0: ; %entry -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_and_b32 s0, s2, 0xffff -; VI-NEXT: s_and_b32 s1, s3, 0xffff -; VI-NEXT: s_mul_i32 s0, s0, s1 -; VI-NEXT: s_lshr_b32 s0, s0, 16 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_and_b32 s4, s6, 0xffff +; VI-NEXT: s_and_b32 s5, s7, 0xffff +; VI-NEXT: s_mul_i32 s4, s4, s5 +; VI-NEXT: s_lshr_b32 s4, s4, 16 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: test_umulhi16_i32: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s2, s2, 0xffff -; GFX9-NEXT: s_and_b32 s3, s3, 0xffff -; GFX9-NEXT: s_mul_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] +; GFX9-NEXT: s_and_b32 s0, s6, 0xffff +; GFX9-NEXT: s_and_b32 s1, s7, 0xffff +; GFX9-NEXT: s_mul_i32 s0, s0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm entry: %a.16 = and i32 %a, 65535 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 0473f80..65f4a1b 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -156,15 +156,15 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; GFX8-LABEL: scalar_or_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_or_b32 s0, s2, s3 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_mov_b32 s0, s4 +; GFX8-NEXT: s_or_b32 s4, s6, s7 +; GFX8-NEXT: s_mov_b32 s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; EG-LABEL: scalar_or_i32: diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll index a3f7906..1899a0ab 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll @@ -4,17 +4,17 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -35,17 +35,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_broadcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_broadcast_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -67,17 +67,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -99,17 +99,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_broadcast_neg_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_broadcast_neg_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -132,17 +132,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_neg_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_neg_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_lo:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -163,17 +163,17 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_neg_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_neg_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 op_sel_hi:[1,1,0] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v3, v0, s[0:1] +; GCN-NEXT: global_store_dword v3, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -194,16 +194,16 @@ bb: define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: add_vector_neg_bitcast_scalar_lo: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v0, v0 ; GCN-NEXT: ds_read_u16 v1, v1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_add_u16 v0, v0, v1 op_sel_hi:[1,0] neg_lo:[0,1] neg_hi:[0,1] -; GCN-NEXT: global_store_dword v2, v0, s[0:1] +; GCN-NEXT: global_store_dword v2, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4 @@ -222,11 +222,11 @@ bb: define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_scalar_lo_neg_scalar_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: v_mov_b32_e32 v4, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v3, v1 @@ -237,7 +237,7 @@ define amdgpu_kernel void @fma_vector_vector_scalar_lo_neg_scalar_hi(ptr addrspa ; GCN-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; GCN-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v1 -; GCN-NEXT: global_store_dword v4, v0, s[0:1] +; GCN-NEXT: global_store_dword v4, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 @@ -261,10 +261,10 @@ bb: define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspace(1) %out, ptr addrspace(3) %lds, ptr addrspace(3) %arg2) #0 { ; GCN-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: ds_read_b32 v2, v0 ; GCN-NEXT: ds_read_b32 v0, v0 offset:4 ; GCN-NEXT: ds_read_u16 v3, v1 @@ -273,7 +273,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_fma_f16 v0, v2, v0, v3 neg_lo:[0,0,1] neg_hi:[0,0,1] -; GCN-NEXT: global_store_dword v1, v0, s[0:1] +; GCN-NEXT: global_store_dword v1, v0, s[4:5] ; GCN-NEXT: s_endpgm bb: %lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll index e076df9..3f8b64b 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll @@ -720,13 +720,13 @@ define amdgpu_kernel void @v8i32_kernel_preload_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @v3i16_kernel_preload_arg(ptr addrspace(1) nocapture %out, <3 x i16> %in) #0 { ; GFX940-NO-PRELOAD-LABEL: v3i16_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3i16_kernel_preload_arg: @@ -934,13 +934,13 @@ define amdgpu_kernel void @v3f32_kernel_preload_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @v5i8_kernel_preload_arg(ptr addrspace(1) nocapture %out, <5 x i8> %in) #0 { ; GFX940-NO-PRELOAD-LABEL: v5i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_byte v0, v1, s[4:5] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v5i8_kernel_preload_arg: @@ -1172,11 +1172,11 @@ define amdgpu_kernel void @v5f64_kernel_preload_arg(ptr addrspace(1) nocapture % define amdgpu_kernel void @v8i8_kernel_preload_arg(ptr addrspace(1) %out, <8 x i8> %in) #0 { ; GFX940-NO-PRELOAD-LABEL: v8i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, 0 ; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[2:3] -; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX940-NO-PRELOAD-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v8i8_kernel_preload_arg: @@ -1592,13 +1592,13 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) %out, <2 define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) %out, <3 x bfloat> %in) #0 { ; GFX940-NO-PRELOAD-LABEL: v3bfloat_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v3bfloat_kernel_preload_arg: @@ -1983,14 +1983,14 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) %out, fp128 define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) %out, <7 x i8> %in) #0 { ; GFX940-NO-PRELOAD-LABEL: v7i8_kernel_preload_arg: ; GFX940-NO-PRELOAD: ; %bb.0: -; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX940-NO-PRELOAD-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0 ; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v0, 0 ; GFX940-NO-PRELOAD-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s3 -; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s2 -; GFX940-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[0:1] offset:6 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[0:1] offset:4 sc0 sc1 -; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[0:1] sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v1, s7 +; GFX940-NO-PRELOAD-NEXT: v_mov_b32_e32 v2, s6 +; GFX940-NO-PRELOAD-NEXT: global_store_byte_d16_hi v0, v1, s[4:5] offset:6 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_short v0, v1, s[4:5] offset:4 sc0 sc1 +; GFX940-NO-PRELOAD-NEXT: global_store_dword v0, v2, s[4:5] sc0 sc1 ; GFX940-NO-PRELOAD-NEXT: s_endpgm ; ; GFX940-PRELOAD-2-LABEL: v7i8_kernel_preload_arg: diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index 5bb260c..2ce0b9e 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -5,28 +5,28 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr addrspace(8) noalias %b) { ; SDAG-LABEL: buffers_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 ; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 ; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 ; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 ; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GISEL-NEXT: s_endpgm %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) %s0 = fmul float %l0, %l0 @@ -50,40 +50,40 @@ define amdgpu_kernel void @buffers_dont_alias(ptr addrspace(8) noalias %a, ptr a define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr noalias %b.flat) { ; SDAG-LABEL: buffers_from_flat_dont_alias: ; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; SDAG-NEXT: s_mov_b32 s7, 0 -; SDAG-NEXT: s_mov_b32 s6, 16 +; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; SDAG-NEXT: s_mov_b32 s3, 0 +; SDAG-NEXT: s_mov_b32 s2, 16 ; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: s_and_b32 s5, s1, 0xffff -; SDAG-NEXT: s_mov_b32 s4, s0 -; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; SDAG-NEXT: s_and_b32 s5, s3, 0xffff -; SDAG-NEXT: s_mov_b32 s4, s2 +; SDAG-NEXT: s_and_b32 s1, s5, 0xffff +; SDAG-NEXT: s_mov_b32 s0, s4 +; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; SDAG-NEXT: s_and_b32 s1, s7, 0xffff +; SDAG-NEXT: s_mov_b32 s0, s6 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0 ; SDAG-NEXT: v_mul_f32_e32 v1, v1, v1 ; SDAG-NEXT: v_mul_f32_e32 v2, v2, v2 ; SDAG-NEXT: v_mul_f32_e32 v3, v3, v3 -; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SDAG-NEXT: s_endpgm ; ; GISEL-LABEL: buffers_from_flat_dont_alias: ; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-NEXT: s_mov_b32 s7, 0 -; GISEL-NEXT: s_mov_b32 s6, 16 +; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-NEXT: s_mov_b32 s3, 0 +; GISEL-NEXT: s_mov_b32 s2, 16 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: s_and_b32 s5, s1, 0xffff -; GISEL-NEXT: s_mov_b32 s4, s0 -; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 -; GISEL-NEXT: s_and_b32 s5, s3, 0xffff -; GISEL-NEXT: s_mov_b32 s4, s2 +; GISEL-NEXT: s_and_b32 s1, s5, 0xffff +; GISEL-NEXT: s_mov_b32 s0, s4 +; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 +; GISEL-NEXT: s_and_b32 s1, s7, 0xffff +; GISEL-NEXT: s_mov_b32 s0, s6 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0 ; GISEL-NEXT: v_mul_f32_e32 v1, v1, v1 ; GISEL-NEXT: v_mul_f32_e32 v2, v2, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 -; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GISEL-NEXT: s_endpgm %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0) %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index a87973d..4a00473 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -35,35 +35,35 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotl_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s3, 32, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 +; GFX8-NEXT: s_sub_i32 s0, 32, s7 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotl_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_sub_i32 s3, 32, s3 -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_sub_i32 s0, 32, s7 +; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s0 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotl_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_sub_i32 s3, 32, s3 +; GFX11-NEXT: s_sub_i32 s0, 32, s7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s6, s6, s0 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 058ee58..d6431d7 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -32,31 +32,31 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) { ; ; GFX8-LABEL: rotr_i32: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s7 +; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX10-LABEL: rotr_i32: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: v_alignbit_b32 v1, s6, s6, s7 +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: rotr_i32: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_alignbit_b32 v1, s6, s6, s7 +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index b81af3e..1384fb0 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -8,34 +8,34 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; GFX9-LABEL: s_shl_v2i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, s7, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_shl_v2i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_lshl_b32 s0, s0, s1 -; VI-NEXT: s_lshl_b32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_lshr_b32 s5, s7, 16 +; VI-NEXT: s_lshl_b32 s4, s4, s5 +; VI-NEXT: s_lshl_b32 s5, s6, s7 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; CI-LABEL: s_shl_v2i16: @@ -59,26 +59,26 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 ; ; GFX10-LABEL: s_shl_v2i16: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_lshlrev_b16 v0, s7, s6 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_shl_v2i16: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_lshlrev_b16 v0, s3, s2 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: v_pk_lshlrev_b16 v0, s7, s6 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index 9a03d21..33249e4 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -19,16 +19,16 @@ define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_cmp_eq_u32 s6, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i32 @@ -96,17 +96,17 @@ define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i64: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: s_cmp_eq_u32 s6, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] ; VI-NEXT: v_mov_b32_e32 v1, v0 -; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i64 @@ -235,16 +235,16 @@ define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 % ; ; VI-LABEL: s_sext_i1_to_i16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, s3 -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] -; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 +; VI-NEXT: s_cmp_eq_u32 s6, s7 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i16 diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index f88aaf3..23222a4 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -208,11 +208,11 @@ define amdgpu_kernel void @local_store_i48(ptr addrspace(3) %ptr, i48 %arg) #0 { ; GFX11-LABEL: local_store_i48: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 +; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-NEXT: ds_store_b16 v0, v1 offset:4 ; GFX11-NEXT: ds_store_b32 v0, v2 ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index ded308a..45aa544 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -22,33 +22,33 @@ define amdgpu_kernel void @s_sub_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; GFX8-LABEL: s_sub_i32: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_sub_i32 s2, s2, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: s_sub_i32 s0, s6, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_sub_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_sub_i32 s2, s2, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_sub_i32 s0, s6, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: s_sub_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_co_i32 s2, s2, s3 +; GFX12-NEXT: s_sub_co_i32 s0, s6, s7 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -95,12 +95,12 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) { ; ; GFX12-LABEL: s_sub_imm_i32: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x24 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_sub_co_i32 s2, 0x4d2, s2 +; GFX12-NEXT: s_sub_co_i32 s0, 0x4d2, s6 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 6ec213a..6c53afe 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -214,58 +214,58 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addr define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #1 { ; GFX9-LABEL: s_test_sub_v2i16_kernarg: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: v_pk_sub_i16 v0, s2, v0 -; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s7 +; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: s_mov_b32 s1, s5 +; GFX9-NEXT: v_pk_sub_i16 v0, s6, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: s_test_sub_v2i16_kernarg: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshr_b32 s0, s2, 16 -; VI-NEXT: s_lshr_b32 s1, s3, 16 -; VI-NEXT: s_sub_i32 s0, s0, s1 -; VI-NEXT: s_sub_i32 s1, s2, s3 -; VI-NEXT: s_lshl_b32 s0, s0, 16 -; VI-NEXT: s_and_b32 s1, s1, 0xffff -; VI-NEXT: s_or_b32 s0, s1, s0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s0, s4 +; VI-NEXT: s_mov_b32 s1, s5 +; VI-NEXT: s_lshr_b32 s4, s6, 16 +; VI-NEXT: s_lshr_b32 s5, s7, 16 +; VI-NEXT: s_sub_i32 s4, s4, s5 +; VI-NEXT: s_sub_i32 s5, s6, s7 +; VI-NEXT: s_lshl_b32 s4, s4, 16 +; VI-NEXT: s_and_b32 s5, s5, 0xffff +; VI-NEXT: s_or_b32 s4, s5, s4 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: s_test_sub_v2i16_kernarg: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: s_mov_b32 s7, 0x31016000 -; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX10-NEXT: s_mov_b32 s4, s0 -; GFX10-NEXT: s_mov_b32 s5, s1 -; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX10-NEXT: v_pk_sub_i16 v0, s6, s7 +; GFX10-NEXT: s_mov_b32 s0, s4 +; GFX10-NEXT: s_mov_b32 s1, s5 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_test_sub_v2i16_kernarg: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, -1 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_pk_sub_i16 v0, s2, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-NEXT: v_pk_sub_i16 v0, s6, s7 +; GFX11-NEXT: s_mov_b32 s0, s4 +; GFX11-NEXT: s_mov_b32 s1, s5 +; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index f686aad..c7952f5 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -218,35 +218,35 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; ; VI-LABEL: s_udiv_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; VI-NEXT: s_mov_b32 s7, 0xf000 -; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_u32_e32 v0, s3 -; VI-NEXT: s_sub_i32 s4, 0, s3 -; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: v_cvt_f32_u32_e32 v0, s7 +; VI-NEXT: s_sub_i32 s0, 0, s7 +; VI-NEXT: s_mov_b32 s1, s5 ; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 -; VI-NEXT: v_mul_lo_u32 v1, s4, v0 -; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: v_mul_lo_u32 v1, s0, v0 +; VI-NEXT: s_mov_b32 s0, s4 ; VI-NEXT: v_mul_hi_u32 v1, v0, v1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; VI-NEXT: v_mul_hi_u32 v0, s2, v0 -; VI-NEXT: v_readfirstlane_b32 s0, v0 -; VI-NEXT: s_mul_i32 s0, s0, s3 -; VI-NEXT: s_sub_i32 s0, s2, s0 -; VI-NEXT: s_sub_i32 s1, s0, s3 +; VI-NEXT: v_mul_hi_u32 v0, s6, v0 +; VI-NEXT: v_readfirstlane_b32 s4, v0 +; VI-NEXT: s_mul_i32 s4, s4, s7 +; VI-NEXT: s_sub_i32 s4, s6, s4 +; VI-NEXT: s_sub_i32 s5, s4, s7 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; VI-NEXT: s_cmp_ge_u32 s0, s3 +; VI-NEXT: s_cmp_ge_u32 s4, s7 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NEXT: s_cselect_b32 s0, s1, s0 +; VI-NEXT: s_cselect_b32 s4, s5, s4 ; VI-NEXT: v_add_u32_e32 v1, vcc, 1, v0 -; VI-NEXT: s_cmp_ge_u32 s0, s3 +; VI-NEXT: s_cmp_ge_u32 s4, s7 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; ; GCN-LABEL: s_udiv_i32: diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index fc6df73..e5de7d0 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -122,38 +122,38 @@ define amdgpu_kernel void @v_cnd_nan(ptr addrspace(1) %out, i32 %c, float %f) #0 ; ; VI-LABEL: v_cnd_nan: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_cmp_eq_u32 s2, 0 -; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_cmp_eq_u32 s6, 0 +; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: s_cselect_b64 vcc, -1, 0 ; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: v_cnd_nan: ; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_cmp_eq_u32 s2, 0 -; GFX10-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_cmp_eq_u32 s6, 0 +; GFX10-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, -1, s7, s[0:1] +; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_cnd_nan: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_cmp_eq_u32 s2, 0 -; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GFX11-NEXT: s_cmp_eq_u32 s6, 0 +; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s3, s[4:5] -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v1, -1, s7, s[0:1] +; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index 89fef7e..7f69c47 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -89,110 +89,110 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg %src0ext, i32 inreg %src1ext) { ; SDAG-VI-LABEL: basic_smax_smin_sgpr: ; SDAG-VI: ; %bb.0: -; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff ; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0 -; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v1, s6, 0 +; SDAG-VI-NEXT: v_max_i16_e64 v2, s7, 0 ; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1 ; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0 -; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0 -; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1 +; SDAG-VI-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-VI-NEXT: v_mov_b32_e32 v1, s5 ; SDAG-VI-NEXT: flat_store_dword v[0:1], v2 ; SDAG-VI-NEXT: s_endpgm ; ; SDAG-GFX9-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX9: ; %bb.0: -; SDAG-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; SDAG-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0xff ; SDAG-GFX9-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-GFX9-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX9-NEXT: v_med3_i16 v2, s2, 0, v1 -; SDAG-GFX9-NEXT: v_med3_i16 v1, s3, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v2, s6, 0, v1 +; SDAG-GFX9-NEXT: v_med3_i16 v1, s7, 0, v1 ; SDAG-GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SDAG-GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 -; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; SDAG-GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; SDAG-GFX9-NEXT: s_endpgm ; ; SDAG-GFX11-LABEL: basic_smax_smin_sgpr: ; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; SDAG-GFX11-NEXT: v_mov_b32_e32 v2, 0 ; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-GFX11-NEXT: v_med3_i16 v0, s2, 0, 0xff -; SDAG-GFX11-NEXT: v_med3_i16 v1, s3, 0, 0xff +; SDAG-GFX11-NEXT: v_med3_i16 v0, s6, 0, 0xff +; SDAG-GFX11-NEXT: v_med3_i16 v1, s7, 0, 0xff ; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SDAG-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; SDAG-GFX11-NEXT: global_store_b32 v2, v0, s[0:1] +; SDAG-GFX11-NEXT: global_store_b32 v2, v0, s[4:5] ; SDAG-GFX11-NEXT: s_nop 0 ; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; SDAG-GFX11-NEXT: s_endpgm ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: -; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0 -; GISEL-VI-NEXT: s_sext_i32_i16 s5, 0xff +; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-VI-NEXT: s_sext_i32_i16 s0, 0 +; GISEL-VI-NEXT: s_sext_i32_i16 s1, 0xff ; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 +; GISEL-VI-NEXT: s_sext_i32_i16 s2, s6 +; GISEL-VI-NEXT: s_sext_i32_i16 s3, s7 +; GISEL-VI-NEXT: s_max_i32 s2, s2, s0 +; GISEL-VI-NEXT: s_max_i32 s0, s3, s0 +; GISEL-VI-NEXT: s_sext_i32_i16 s0, s0 ; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_max_i32 s3, s3, s4 -; GISEL-VI-NEXT: s_max_i32 s2, s2, s4 -; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-VI-NEXT: s_min_i32 s3, s3, s5 -; GISEL-VI-NEXT: s_min_i32 s2, s2, s5 -; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3 -; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2 -; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16 -; GISEL-VI-NEXT: s_or_b32 s2, s2, s3 -; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0 -; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-VI-NEXT: s_min_i32 s0, s0, s1 +; GISEL-VI-NEXT: s_min_i32 s2, s2, s1 +; GISEL-VI-NEXT: s_and_b32 s0, 0xffff, s0 +; GISEL-VI-NEXT: s_and_b32 s1, 0xffff, s2 +; GISEL-VI-NEXT: s_lshl_b32 s0, s0, 16 +; GISEL-VI-NEXT: s_or_b32 s0, s1, s0 +; GISEL-VI-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-VI-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-VI-NEXT: flat_store_dword v[0:1], v2 ; GISEL-VI-NEXT: s_endpgm ; ; GISEL-GFX9-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX9: ; %bb.0: -; GISEL-GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s4, 0 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s5, 0xff +; GISEL-GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, 0 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s1, 0xff ; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s6 +; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s7 +; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s0 +; GISEL-GFX9-NEXT: s_max_i32 s0, s3, s0 ; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-GFX9-NEXT: s_max_i32 s2, s2, s4 -; GISEL-GFX9-NEXT: s_max_i32 s3, s3, s4 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX9-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s5 -; GISEL-GFX9-NEXT: s_min_i32 s3, s3, s5 -; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GISEL-GFX9-NEXT: s_sext_i32_i16 s0, s0 +; GISEL-GFX9-NEXT: s_min_i32 s2, s2, s1 +; GISEL-GFX9-NEXT: s_min_i32 s0, s0, s1 +; GISEL-GFX9-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX9-NEXT: global_store_dword v1, v0, s[4:5] ; GISEL-GFX9-NEXT: s_endpgm ; ; GISEL-GFX11-LABEL: basic_smax_smin_sgpr: ; GISEL-GFX11: ; %bb.0: -; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, 0 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s5, 0xff +; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s0, 0 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s1, 0xff ; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, s6 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s7 +; GISEL-GFX11-NEXT: s_max_i32 s2, s2, s0 +; GISEL-GFX11-NEXT: s_max_i32 s0, s3, s0 ; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-GFX11-NEXT: s_max_i32 s2, s2, s4 -; GISEL-GFX11-NEXT: s_max_i32 s3, s3, s4 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s2, s2 -; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, s3 -; GISEL-GFX11-NEXT: s_min_i32 s2, s2, s5 -; GISEL-GFX11-NEXT: s_min_i32 s3, s3, s5 +; GISEL-GFX11-NEXT: s_sext_i32_i16 s0, s0 +; GISEL-GFX11-NEXT: s_min_i32 s2, s2, s1 +; GISEL-GFX11-NEXT: s_min_i32 s0, s0, s1 ; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s3 -; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s0, s2, s0 +; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5] ; GISEL-GFX11-NEXT: s_nop 0 ; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GISEL-GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 901e88a..e12a4be 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1344,40 +1344,40 @@ exit: define amdgpu_kernel void @fdiv_f32(ptr addrspace(1) %out, float %a, float %b) #0 { ; GFX1032-LABEL: fdiv_f32: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_div_scale_f32 v0, s4, s3, s3, s2 +; GFX1032-NEXT: v_div_scale_f32 v0, s0, s7, s7, s6 ; GFX1032-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1032-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s2, s3, s2 +; GFX1032-NEXT: v_div_scale_f32 v2, vcc_lo, s6, s7, s6 ; GFX1032-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1032-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1032-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1032-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1032-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX1032-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX1032-NEXT: global_store_dword v1, v0, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: fdiv_f32: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_div_scale_f32 v0, s[4:5], s3, s3, s2 +; GFX1064-NEXT: v_div_scale_f32 v0, s[0:1], s7, s7, s6 ; GFX1064-NEXT: v_rcp_f32_e32 v1, v0 ; GFX1064-NEXT: v_fma_f32 v2, -v0, v1, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v1, v2, v1 -; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s2, s3, s2 +; GFX1064-NEXT: v_div_scale_f32 v2, vcc, s6, s7, s6 ; GFX1064-NEXT: v_mul_f32_e32 v3, v2, v1 ; GFX1064-NEXT: v_fma_f32 v4, -v0, v3, v2 ; GFX1064-NEXT: v_fmac_f32_e32 v3, v4, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v0, v3, v2 ; GFX1064-NEXT: v_div_fmas_f32 v0, v0, v1, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s3, s2 -; GFX1064-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064-NEXT: v_div_fixup_f32 v0, v0, s7, s6 +; GFX1064-NEXT: global_store_dword v1, v0, s[4:5] ; GFX1064-NEXT: s_endpgm entry: %fdiv = fdiv float %a, %b @@ -2138,23 +2138,23 @@ main_body: define amdgpu_kernel void @test_intr_fcmp_i64(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i64: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GFX1032-NEXT: v_mov_b32_e32 v0, s2 -; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GFX1032-NEXT: v_mov_b32_e32 v0, s0 +; GFX1032-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i64: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| -; GFX1064-NEXT: v_mov_b32_e32 v0, s2 -; GFX1064-NEXT: v_mov_b32_e32 v1, s3 -; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| +; GFX1064-NEXT: v_mov_b32_e32 v0, s0 +; GFX1064-NEXT: v_mov_b32_e32 v1, s1 +; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.i64.f32(float %src, float %temp, i32 1) @@ -2195,22 +2195,22 @@ define amdgpu_kernel void @test_intr_icmp_i64(ptr addrspace(1) %out, i32 %src) { define amdgpu_kernel void @test_intr_fcmp_i32(ptr addrspace(1) %out, float %src, float %a) { ; GFX1032-LABEL: test_intr_fcmp_i32: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_eq_f32_e64 s2, s2, |s3| -; GFX1032-NEXT: v_mov_b32_e32 v1, s2 -; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032-NEXT: v_cmp_eq_f32_e64 s0, s6, |s7| +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_intr_fcmp_i32: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, |s3| -; GFX1064-NEXT: v_mov_b32_e32 v1, s2 -; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064-NEXT: v_cmp_eq_f32_e64 s[0:1], s6, |s7| +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: global_store_dword v0, v1, s[4:5] ; GFX1064-NEXT: s_endpgm %temp = call float @llvm.fabs.f32(float %a) %result = call i32 @llvm.amdgcn.fcmp.i32.f32(float %src, float %temp, i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index e15fd7f..5422bfa 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -276,12 +276,12 @@ define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; ; VI-LABEL: scalar_xor_i32: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s2, s2, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: s_xor_b32 s0, s6, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %result = xor i32 %a, %b -- cgit v1.1