diff options
author | Sergei Barannikov <barannikov88@gmail.com> | 2025-02-01 20:40:50 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2025-02-01 20:40:50 +0300 |
commit | ff9c041d96afdf378d11c14bea60de8437f4fbcc (patch) | |
tree | 4af3083e3db50ff6bf4ce3e6575b0ce522ff0cdf | |
parent | 15336823adbd41d401185c2fecf2c063f6a64f73 (diff) | |
download | llvm-ff9c041d96afdf378d11c14bea60de8437f4fbcc.zip llvm-ff9c041d96afdf378d11c14bea60de8437f4fbcc.tar.gz llvm-ff9c041d96afdf378d11c14bea60de8437f4fbcc.tar.bz2 |
[MachineScheduler] Fix physreg dependencies of ExitSU (#123541)
Providing the correct operand index allows addPhysRegDataDeps to compute
the correct latency.
Pull Request: https://github.com/llvm/llvm-project/pull/123541
58 files changed, 1302 insertions, 1229 deletions
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp index cc98c52..a268047 100644 --- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -209,13 +209,25 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() { ExitSU.setInstr(ExitMI); // Add dependencies on the defs and uses of the instruction. if (ExitMI) { + const MCInstrDesc &MIDesc = ExitMI->getDesc(); for (const MachineOperand &MO : ExitMI->all_uses()) { + unsigned OpIdx = MO.getOperandNo(); Register Reg = MO.getReg(); if (Reg.isPhysical()) { + // addPhysRegDataDeps uses the provided operand index to retrieve + // the operand use cycle from the scheduling model. If the operand + // is "fake" (e.g., an operand of a call instruction used to pass + // an argument to the called function), the scheduling model may not + // have an entry for it. If this is the case, pass -1 as operand index, + // which will cause addPhysRegDataDeps to add an artificial dependency. + // FIXME: Using hasImplicitUseOfPhysReg here is inaccurate as it misses + // aliases. When fixing, make sure to update addPhysRegDataDeps, too. + bool IsRealUse = OpIdx < MIDesc.getNumOperands() || + MIDesc.hasImplicitUseOfPhysReg(Reg); for (MCRegUnit Unit : TRI->regunits(Reg)) - Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit)); + Uses.insert(PhysRegSUOper(&ExitSU, IsRealUse ? 
OpIdx : -1, Unit)); } else if (Reg.isVirtual() && MO.readsReg()) { - addVRegUseDeps(&ExitSU, MO.getOperandNo()); + addVRegUseDeps(&ExitSU, OpIdx); } } } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll index c477732..a91e41e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -15,12 +15,12 @@ define ptr addrspace(1) @call_assert_align() { ; CHECK-NEXT: v_writelane_b32 v40, s16, 2 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: global_store_dword v[0:1], v2, off @@ -45,11 +45,11 @@ define ptr addrspace(1) @tail_call_assert_align() { ; CHECK-LABEL: tail_call_assert_align: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, ext@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: s_setpc_b64 s[16:17] entry: %call = tail call align 4 ptr addrspace(1) @ext(ptr addrspace(1) null) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 410d3b1..c99424fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -44,8 
+44,8 @@ define amdgpu_kernel void @kernel_caller_stack() { ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 12 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 11 -; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_add_u32 s2, s32, 16 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 12 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4 @@ -239,11 +239,11 @@ define void @func_caller_stack() { ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; MUBUF-NEXT: v_mov_b32_e32 v0, 12 -; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 @@ -274,15 +274,15 @@ define void @func_caller_stack() { ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 12 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 11 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 16 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 12 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; 
FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 @@ -312,10 +312,10 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_writelane_b32 v40, s4, 2 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_waitcnt vmcnt(1) ; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; MUBUF-NEXT: s_waitcnt vmcnt(1) @@ -394,8 +394,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0 ; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 ; FLATSCR-NEXT: s_add_u32 s0, s32, 8 -; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: s_add_u32 s2, s32, 56 +; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s32 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 935200d..91e16d9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -191,9 +191,9 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: s_mov_b32 s6, -1 -; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX10-NEXT: s_cbranch_execz .LBB3_6 ; 
GFX10-NEXT: ; %bb.1: ; %loop.start.preheader diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll index 191739b..951182f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.ll @@ -387,8 +387,8 @@ define amdgpu_ps void @and_i1_scc(i32 inreg %a, i32 inreg %b, ptr addrspace(1) % define amdgpu_ps void @divergent_phi_with_uniform_inputs(i32 %a, ptr addrspace(1) %out) { ; OLD_RBS-LABEL: divergent_phi_with_uniform_inputs: ; OLD_RBS: ; %bb.0: ; %A -; OLD_RBS-NEXT: s_mov_b32 s0, 0 ; OLD_RBS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; OLD_RBS-NEXT: s_mov_b32 s0, 0 ; OLD_RBS-NEXT: s_and_saveexec_b32 s1, vcc_lo ; OLD_RBS-NEXT: ; %bb.1: ; %B ; OLD_RBS-NEXT: s_mov_b32 s0, 1 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index 194a23f..b7fbb8b 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -25,10 +25,10 @@ define void @parent_func_missing_inputs() #0 { ; FIXEDABI-NEXT: v_writelane_b32 v40, s16, 2 ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 -; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1 ; FIXEDABI-NEXT: s_getpc_b64 s[16:17] ; FIXEDABI-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4 ; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12 +; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1 ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0 @@ -49,21 +49,21 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-SDAG: ; %bb.0: ; FIXEDABI-SDAG-NEXT: s_add_i32 s4, s4, s9 ; FIXEDABI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 
10, v1 ; FIXEDABI-SDAG-NEXT: s_add_u32 s0, s0, s9 +; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0 ; FIXEDABI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; FIXEDABI-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s14, s8 +; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5] +; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 +; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 ; FIXEDABI-SDAG-NEXT: v_or_b32_e32 v31, v0, v2 ; FIXEDABI-SDAG-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-SDAG-NEXT: s_mov_b32 s32, 0 -; FIXEDABI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s5 -; FIXEDABI-SDAG-NEXT: s_getpc_b64 s[4:5] -; FIXEDABI-SDAG-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 -; FIXEDABI-SDAG-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 ; FIXEDABI-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] ; FIXEDABI-SDAG-NEXT: s_endpgm ; @@ -71,21 +71,21 @@ define amdgpu_kernel void @parent_kernel_missing_inputs() #0 { ; FIXEDABI-GISEL: ; %bb.0: ; FIXEDABI-GISEL-NEXT: s_add_i32 s4, s4, s9 ; FIXEDABI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; FIXEDABI-GISEL-NEXT: s_add_u32 s0, s0, s9 +; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 ; FIXEDABI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2 -; FIXEDABI-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s14, s8 ; FIXEDABI-GISEL-NEXT: v_or_b32_e32 v31, v0, v1 +; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5] +; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 +; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 ; 
FIXEDABI-GISEL-NEXT: s_mov_b64 s[8:9], 0 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s12, s6 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s13, s7 ; FIXEDABI-GISEL-NEXT: s_mov_b32 s32, 0 -; FIXEDABI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s5 -; FIXEDABI-GISEL-NEXT: s_getpc_b64 s[4:5] -; FIXEDABI-GISEL-NEXT: s_add_u32 s4, s4, requires_all_inputs@rel32@lo+4 -; FIXEDABI-GISEL-NEXT: s_addc_u32 s5, s5, requires_all_inputs@rel32@hi+12 ; FIXEDABI-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; FIXEDABI-GISEL-NEXT: s_endpgm call void @requires_all_inputs() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 6166c05..8bb8ecb 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1286,9 +1286,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -1412,9 +1412,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ 
-1540,9 +1541,10 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1232_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -3129,8 +3131,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB5_2 @@ -4839,9 +4841,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB8_2 @@ -4965,9 +4967,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 
vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1132_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB8_2 @@ -5093,9 +5096,10 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1232_DPP-NEXT: s_or_saveexec_b32 s4, -1 ; GFX1232_DPP-NEXT: v_writelane_b32 v3, s5, 16 ; GFX1232_DPP-NEXT: s_mov_b32 exec_lo, s4 +; GFX1232_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: s_mov_b32 s4, s6 ; GFX1232_DPP-NEXT: s_mov_b32 s6, -1 -; GFX1232_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1232_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1232_DPP-NEXT: s_cbranch_execz .LBB8_2 @@ -6715,8 +6719,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace( ; GFX1032_DPP-NEXT: v_writelane_b32 v8, s8, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v7, s7, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s6 -; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032_DPP-NEXT: s_mov_b32 s6, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s8, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB11_2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 8062dbb..3c0646c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -919,9 +919,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: 
s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -1030,9 +1030,9 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB2_2 @@ -2630,8 +2630,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB6_2 @@ -2812,8 +2812,8 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB6_2 @@ -3301,8 +3301,8 @@ define amdgpu_kernel void @add_i64_varying_nouse() { ; GFX1032_DPP-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3 ; GFX1032_DPP-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo ; GFX1032_DPP-NEXT: s_mov_b32 
exec_lo, s0 -; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mbcnt_lo_u32_b32 v9, exec_lo, 0 +; GFX1032_DPP-NEXT: v_mov_b32_e32 v7, v1 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032_DPP-NEXT: v_mov_b32_e32 v8, v2 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 @@ -4341,9 +4341,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB10_2 @@ -4452,9 +4452,9 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB10_2 @@ -6075,8 +6075,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v8, s6, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v7, s3, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB14_2 @@ -6257,8 +6257,8 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; 
GFX1132_DPP-NEXT: v_writelane_b32 v7, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB14_2 @@ -6757,9 +6757,9 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB15_2 @@ -6868,9 +6868,10 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB15_2 @@ -7464,8 +7465,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB16_2 @@ -7621,8 
+7622,8 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB16_2 @@ -8122,9 +8123,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB17_2 @@ -8233,9 +8234,9 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB17_2 @@ -8828,8 +8829,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: 
s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB18_2 @@ -8985,8 +8986,8 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB18_2 @@ -9486,9 +9487,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB19_2 @@ -9597,9 +9598,9 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB19_2 @@ -10192,8 +10193,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032_DPP-NEXT: s_mov_b32 
s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB20_2 @@ -10349,8 +10350,8 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v6, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB20_2 @@ -10849,9 +10850,9 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB21_2 @@ -10960,9 +10961,10 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB21_2 @@ -11967,8 +11969,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 ; GFX1032_DPP-NEXT: 
s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB23_2 @@ -12181,8 +12183,8 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB23_2 @@ -12682,9 +12684,9 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB24_2 @@ -12793,9 +12795,10 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB24_2 @@ -13800,8 +13803,8 @@ define amdgpu_kernel void 
@min_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB26_2 @@ -14014,8 +14017,8 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB26_2 @@ -14516,9 +14519,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB27_2 @@ -14627,9 +14630,9 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; 
GFX1132_DPP-NEXT: s_cbranch_execz .LBB27_2 @@ -15625,8 +15628,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB29_2 @@ -15833,8 +15836,8 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB29_2 @@ -16334,9 +16337,9 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1032_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 -; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB30_2 @@ -16445,9 +16448,10 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: s_or_saveexec_b32 s0, -1 ; GFX1132_DPP-NEXT: v_writelane_b32 v3, s1, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s0 +; GFX1132_DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: s_mov_b32 s0, s2 ; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 -; 
GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr0 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB30_2 @@ -17442,8 +17446,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1032_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1032_DPP-NEXT: v_writelane_b32 v4, s6, 16 ; GFX1032_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1032_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1032_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1032_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032_DPP-NEXT: s_cbranch_execz .LBB32_2 @@ -17650,8 +17654,8 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) { ; GFX1132_DPP-NEXT: v_writelane_b32 v5, s3, 16 ; GFX1132_DPP-NEXT: v_writelane_b32 v4, s6, 16 ; GFX1132_DPP-NEXT: s_mov_b32 exec_lo, s2 -; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX1132_DPP-NEXT: s_mov_b32 s2, -1 ; GFX1132_DPP-NEXT: ; implicit-def: $vgpr7_vgpr8 ; GFX1132_DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132_DPP-NEXT: s_cbranch_execz .LBB32_2 diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index 704b68a..0b8ad35 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -43,11 +43,11 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i8_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i8_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], 
s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -103,11 +103,11 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -163,11 +163,11 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -223,12 +223,12 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; 
GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -284,12 +284,12 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -345,13 +345,13 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s19, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -407,14 +407,14 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s20, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: 
s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -470,6 +470,9 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s24, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[24:25] +; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s1, s17 @@ -479,9 +482,6 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_mov_b32 s18, s22 ; GFX9-NEXT: s_mov_b32 s19, s23 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[24:25] -; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -537,11 +537,11 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; 
GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -597,11 +597,11 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -657,11 +657,11 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f32_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f32_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -717,12 +717,12 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 
v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -778,11 +778,11 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -839,11 +839,11 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -899,12 +899,12 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 
0 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -960,12 +960,12 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 ; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -1021,12 +1021,12 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 { ; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ 
-1082,12 +1082,12 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) ; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -1143,11 +1143,11 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg) ; GFX9-NEXT: v_writelane_b32 v40, s17, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p3_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p3_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -1203,14 +1203,14 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre ; GFX9-NEXT: v_writelane_b32 v40, s20, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[20:21] +; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: 
s_getpc_b64 s[20:21] -; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -1266,12 +1266,12 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre ; GFX9-NEXT: v_writelane_b32 v40, s18, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: s_mov_b32 s1, s17 -; GFX9-NEXT: s_mov_b32 s0, s16 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[18:19] ; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12 +; GFX9-NEXT: s_mov_b32 s1, s17 +; GFX9-NEXT: s_mov_b32 s0, s16 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -1327,15 +1327,15 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre ; GFX9-NEXT: v_writelane_b32 v40, s21, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[22:23] +; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s1, s17 ; GFX9-NEXT: s_mov_b32 s0, s16 ; GFX9-NEXT: s_mov_b32 s16, s20 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[22:23] -; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; 
GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -1391,6 +1391,9 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX9-NEXT: v_writelane_b32 v40, s29, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 vcc +; GFX9-NEXT: s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s3, s19 ; GFX9-NEXT: s_mov_b32 s2, s18 ; GFX9-NEXT: s_mov_b32 s1, s17 @@ -1405,9 +1408,6 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) # ; GFX9-NEXT: s_mov_b32 s23, s27 ; GFX9-NEXT: s_mov_b32 s24, s28 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 vcc -; GFX9-NEXT: s_add_u32 vcc_lo, vcc_lo, external_void_func_a15i32_inreg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 vcc_hi, vcc_hi, external_void_func_a15i32_inreg@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], vcc ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -1465,6 +1465,9 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre ; GFX9-NEXT: v_writelane_b32 v40, s21, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: s_getpc_b64 s[22:23] +; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12 ; GFX9-NEXT: s_mov_b32 s3, s7 ; GFX9-NEXT: s_mov_b32 s2, s6 ; GFX9-NEXT: s_mov_b32 s1, s5 @@ -1480,9 +1483,6 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre ; GFX9-NEXT: s_mov_b32 s15, s19 ; GFX9-NEXT: s_mov_b32 s16, s20 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[22:23] -; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s23, s23, 
external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 35d0039..16fe85b 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -71,12 +71,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: v_mov_b32_e32 v0, 1 -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 1 +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -90,12 +90,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 1 -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 1 +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -109,24 +109,23 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: 
s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i1_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 1 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -135,14 +134,14 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: v_mov_b32_e32 v0, 1 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 1 +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i1(i1 true) @@ -164,11 +163,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, 
external_void_func_i1_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm @@ -187,11 +186,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: v_bfe_i32 v0, v0, 0, 1 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm @@ -210,11 +209,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm @@ -226,10 +225,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_signext@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 ; 
GFX11-NEXT: v_bfe_i32 v0, v0, 0, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm @@ -244,12 +243,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm @@ -274,11 +273,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm @@ -297,11 +296,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: v_and_b32_e32 v0, 1, v0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm @@ -320,11 
+319,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm @@ -336,10 +335,10 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_u8 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i1_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i1_zeroext@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm @@ -354,12 +353,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i1_zeroext@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i1_zeroext@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: v_and_b32_e32 v0, 1, v0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm @@ -379,12 +378,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; 
VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: v_mov_b32_e32 v0, 0x7b -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -398,12 +397,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 0x7b -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 0x7b +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -417,24 +416,23 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i8_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; 
GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -444,13 +442,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: v_mov_b32_e32 v0, 0x7b -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x7b +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i8(i8 123) @@ -473,11 +471,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -495,11 +493,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; 
CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -517,11 +515,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -532,11 +530,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_i8 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_signext@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -550,12 +547,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_signext@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_signext@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] 
; HSA-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) undef @@ -578,11 +575,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -600,11 +597,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -622,11 +619,11 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -637,11 +634,10 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_u8 v0, 
off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i8_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i8_zeroext@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -655,12 +651,12 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i8_zeroext@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i8_zeroext@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i8, ptr addrspace(1) undef @@ -679,12 +675,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: v_mov_b32_e32 v0, 0x7b -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x7b +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -698,12 +694,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 0x7b 
-; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 0x7b +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -717,24 +713,23 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i16_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -743,14 +738,14 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: v_mov_b32_e32 v0, 0x7b -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 
s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x7b +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i16(i16 123) @@ -772,11 +767,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -794,11 +789,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -816,11 +811,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 
s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -831,11 +826,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_i16 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_signext@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -849,12 +843,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_signext@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_signext@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) undef @@ -877,11 +871,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -899,11 +893,11 @@ define 
amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -921,11 +915,11 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -936,11 +930,10 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i16_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i16_zeroext@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -954,12 +947,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], 
s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i16_zeroext@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i16_zeroext@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %var = load volatile i16, ptr addrspace(1) undef @@ -978,12 +971,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -997,12 +990,12 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1016,24 +1009,23 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: 
s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i32_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1043,13 +1035,13 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: v_mov_b32_e32 v0, 42 -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 42 +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i32(i32 42) @@ -1067,13 +1059,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 ; VI-NEXT: s_mov_b64 
s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x7b ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1087,13 +1079,13 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0x7b ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1107,25 +1099,24 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_i64_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 
s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1134,15 +1125,15 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i64@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x7b ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_i64@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_i64(i64 123) @@ -1165,11 +1156,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1188,11 +1179,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: 
s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1211,11 +1202,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -1225,13 +1216,12 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 ; GFX11-NEXT: s_mov_b32 s5, s4 -; GFX11-NEXT: s_mov_b32 s32, 0 -; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1245,13 +1235,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 { ; HSA-NEXT: s_mov_b32 s10, -1 ; HSA-NEXT: s_mov_b32 s9, s8 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; HSA-NEXT: s_mov_b32 
flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <2 x i64>, ptr addrspace(1) null @@ -1270,15 +1260,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 ; VI-NEXT: v_mov_b32_e32 v2, 3 ; VI-NEXT: v_mov_b32_e32 v3, 4 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1292,15 +1282,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 ; CI-NEXT: v_mov_b32_e32 v2, 3 ; CI-NEXT: v_mov_b32_e32 v3, 4 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, 
external_void_func_v2i64@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1314,15 +1304,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -1330,12 +1320,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1344,17 +1333,17 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, 
external_void_func_v2i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>) @@ -1377,13 +1366,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v4, 1 ; VI-NEXT: v_mov_b32_e32 v5, 2 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1402,13 +1391,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v4, 1 ; CI-NEXT: v_mov_b32_e32 v5, 2 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, 
external_void_func_v3i64@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1427,13 +1416,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -1445,12 +1434,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; GFX11-NEXT: s_mov_b32 s5, s4 ; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1464,15 +1452,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 { ; HSA-NEXT: s_mov_b32 s10, -1 ; HSA-NEXT: s_mov_b32 s9, s8 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, 
external_void_func_v3i64@rel32@hi+12 ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i64@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null @@ -1498,15 +1486,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v4, 1 ; VI-NEXT: v_mov_b32_e32 v5, 2 ; VI-NEXT: v_mov_b32_e32 v6, 3 ; VI-NEXT: v_mov_b32_e32 v7, 4 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1525,15 +1513,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v4, 1 ; CI-NEXT: v_mov_b32_e32 v5, 2 ; CI-NEXT: v_mov_b32_e32 v6, 3 ; CI-NEXT: v_mov_b32_e32 v7, 4 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 ; CI-NEXT: 
s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1552,15 +1540,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 ; GFX9-NEXT: v_mov_b32_e32 v6, 3 ; GFX9-NEXT: v_mov_b32_e32 v7, 4 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -1573,12 +1561,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; GFX11-NEXT: v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 ; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1592,17 +1579,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 { ; HSA-NEXT: s_mov_b32 s10, -1 ; HSA-NEXT: s_mov_b32 s9, s8 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, 
external_void_func_v4i64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i64@rel32@hi+12 ; HSA-NEXT: v_mov_b32_e32 v4, 1 ; HSA-NEXT: v_mov_b32_e32 v5, 2 ; HSA-NEXT: v_mov_b32_e32 v6, 3 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v7, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i64@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %load = load <2 x i64>, ptr addrspace(1) null @@ -1622,12 +1609,12 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: v_mov_b32_e32 v0, 0x4400 -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 0x4400 +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1641,12 +1628,12 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 4.0 -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 4.0 +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1660,24 +1647,23 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], 
s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_f16_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1686,14 +1672,14 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f16@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 0x4400 +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_f16(half 4.0) @@ -1711,12 +1697,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 
s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: v_mov_b32_e32 v0, 4.0 -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: v_mov_b32_e32 v0, 4.0 +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1730,12 +1716,12 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: v_mov_b32_e32 v0, 4.0 -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: v_mov_b32_e32 v0, 4.0 +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1749,24 +1735,23 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_f32_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 
s2, s2, external_void_func_f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1775,14 +1760,14 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: v_mov_b32_e32 v0, 4.0 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: v_mov_b32_e32 v0, 4.0 +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_f32(float 4.0) @@ -1800,13 +1785,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1820,13 +1805,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; 
CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1840,25 +1825,24 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2f32_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1867,15 +1851,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 { ; HSA-NEXT: 
s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>) @@ -1893,14 +1877,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-NEXT: v_mov_b32_e32 v2, 4.0 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -1914,14 +1898,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; 
CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 ; CI-NEXT: v_mov_b32_e32 v2, 4.0 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -1935,14 +1919,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -1950,12 +1934,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -1964,16 +1947,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, 
s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 4.0 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>) @@ -1991,6 +1974,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1.0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -1998,9 +1984,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; VI-NEXT: v_mov_b32_e32 v3, -1.0 ; VI-NEXT: v_mov_b32_e32 v4, 0.5 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2014,6 +1997,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, 
external_void_func_v5f32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2021,9 +2007,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; CI-NEXT: v_mov_b32_e32 v3, -1.0 ; CI-NEXT: v_mov_b32_e32 v4, 0.5 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -2037,6 +2020,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2044,9 +2030,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -2055,12 +2038,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5f32@rel32@hi+12 -; 
GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2069,8 +2051,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5f32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1.0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2078,9 +2063,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v3, -1.0 ; HSA-NEXT: v_mov_b32_e32 v4, 0.5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5f32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>) @@ -2098,13 +2080,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40100000 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 ; VI-NEXT: 
s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2118,13 +2100,13 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0x40100000 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -2138,25 +2120,24 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_f64_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_f64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] 
+; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2165,15 +2146,15 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f64@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_f64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_f64@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_f64(double 4.0) @@ -2191,15 +2172,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 ; VI-NEXT: v_mov_b32_e32 v2, 0 ; VI-NEXT: v_mov_b32_e32 v3, 0x40100000 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2213,15 +2194,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], 
s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 ; CI-NEXT: v_mov_b32_e32 v2, 0 ; CI-NEXT: v_mov_b32_e32 v3, 0x40100000 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -2235,15 +2216,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -2251,12 +2232,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 
s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2265,17 +2245,17 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 ; HSA-NEXT: v_mov_b32_e32 v3, 0x40100000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f64@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>) @@ -2293,6 +2273,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2301,9 +2284,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; VI-NEXT: v_mov_b32_e32 v4, 0 ; VI-NEXT: v_mov_b32_e32 v5, 0x40200000 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; 
VI-NEXT: s_endpgm ; @@ -2317,6 +2297,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2325,9 +2308,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: v_mov_b32_e32 v5, 0x40200000 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -2341,6 +2321,9 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2349,9 +2332,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -2360,12 +2340,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; 
GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f64@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2374,8 +2353,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f64@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f64@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 2.0 @@ -2384,9 +2366,6 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v4, 0 ; HSA-NEXT: v_mov_b32_e32 v5, 0x40200000 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f64@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f64@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>) @@ -2407,11 +2386,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 
s4, s4, external_void_func_v2i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2428,11 +2407,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -2451,11 +2430,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -2465,11 +2444,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: 
s_endpgm ; @@ -2483,11 +2461,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 { ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i16@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <2 x i16>, ptr addrspace(1) undef @@ -2509,11 +2487,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2530,11 +2508,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_alignbit_b32 v1, v3, v2, 16 ; CI-NEXT: v_mov_b32_e32 v0, v2 @@ -2555,11 +2533,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] 
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -2569,11 +2547,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2587,11 +2564,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 { ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <3 x i16>, ptr addrspace(1) undef @@ -2613,11 +2590,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, 
external_void_func_v3f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2634,11 +2611,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -2660,11 +2637,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -2674,11 +2651,10 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2692,11 +2668,11 @@ define amdgpu_kernel void 
@test_call_external_void_func_v3f16() #0 { ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <3 x half>, ptr addrspace(1) undef @@ -2715,13 +2691,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x20001 ; VI-NEXT: v_mov_b32_e32 v1, 3 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2735,14 +2711,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 ; CI-NEXT: v_mov_b32_e32 v2, 3 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, 
external_void_func_v3i16@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -2756,25 +2732,24 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 3 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v3i16_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2783,15 +2758,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 
0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 3 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>) @@ -2809,13 +2784,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; VI-NEXT: v_mov_b32_e32 v1, 0x4400 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2829,14 +2804,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1.0 ; CI-NEXT: v_mov_b32_e32 v1, 2.0 ; CI-NEXT: v_mov_b32_e32 v2, 4.0 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -2850,13 +2825,13 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, 
s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -2864,12 +2839,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2878,15 +2852,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; HSA-NEXT: v_mov_b32_e32 v1, 0x4400 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, 
external_void_func_v3f16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>) @@ -2907,11 +2881,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -2928,11 +2902,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v0 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v1 @@ -2954,11 +2928,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: 
s_endpgm ; @@ -2968,11 +2942,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -2986,11 +2959,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 { ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <4 x i16>, ptr addrspace(1) undef @@ -3009,13 +2982,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 0x20001 ; VI-NEXT: v_mov_b32_e32 v1, 0x40003 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3029,15 +3002,15 @@ define amdgpu_kernel void 
@test_call_external_void_func_v4i16_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 ; CI-NEXT: v_mov_b32_e32 v2, 3 ; CI-NEXT: v_mov_b32_e32 v3, 4 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3051,13 +3024,13 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -3065,12 +3038,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3079,15 +3051,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 0x20001 ; HSA-NEXT: v_mov_b32_e32 v1, 0x40003 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>) @@ -3108,11 +3080,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3129,11 +3101,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, 
external_void_func_v2f16@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_cvt_f32_f16_e32 v0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -3154,11 +3126,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -3168,11 +3140,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2f16@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3186,11 +3157,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 { ; HSA-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2f16@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2f16@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm 
%val = load <2 x half>, ptr addrspace(1) undef @@ -3212,11 +3183,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3233,11 +3204,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3254,11 +3225,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -3268,11 +3239,10 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 
s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3286,11 +3256,11 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 { ; HSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <2 x i32>, ptr addrspace(1) undef @@ -3309,13 +3279,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3329,13 +3299,13 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; CI-NEXT: 
s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3349,25 +3319,24 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; ; GFX11-LABEL: test_call_external_void_func_v2i32_imm: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3376,15 +3345,15 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: 
s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>) @@ -3402,14 +3371,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: v_mov_b32_e32 v1, 4 ; VI-NEXT: v_mov_b32_e32 v2, 5 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3423,14 +3392,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 3 ; CI-NEXT: v_mov_b32_e32 v1, 4 ; CI-NEXT: v_mov_b32_e32 v2, 5 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 
s4, s4, external_void_func_v3i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3444,14 +3413,14 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -3459,12 +3428,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_mov_b32_e32 v2, 5 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3474,15 +3442,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, 
external_void_func_v3i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>) @@ -3500,15 +3468,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 3 ; VI-NEXT: v_mov_b32_e32 v1, 4 ; VI-NEXT: v_mov_b32_e32 v2, 5 ; VI-NEXT: v_mov_b32_e32 v3, 6 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3522,15 +3490,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 3 ; CI-NEXT: v_mov_b32_e32 v1, 4 ; CI-NEXT: v_mov_b32_e32 v2, 5 ; CI-NEXT: v_mov_b32_e32 v3, 6 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, 
external_void_func_v3i32_i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3544,15 +3512,15 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, 6 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -3560,12 +3528,11 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v3i32_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v3i32_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3575,16 +3542,16 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 { ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: 
s_add_u32 s8, s8, external_void_func_v3i32_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 3 ; HSA-NEXT: v_mov_b32_e32 v1, 4 ; HSA-NEXT: v_mov_b32_e32 v2, 5 ; HSA-NEXT: v_mov_b32_e32 v3, 6 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v3i32_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v3i32_i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6) @@ -3605,11 +3572,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3626,11 +3593,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3647,11 +3614,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: 
s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -3661,11 +3628,10 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; GFX11-NEXT: s_mov_b32 s2, -1 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3679,11 +3645,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 ; HSA-NEXT: s_add_u32 s0, s0, s9 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %val = load <4 x i32>, ptr addrspace(1) undef @@ -3702,15 +3668,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], 
s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 ; VI-NEXT: v_mov_b32_e32 v2, 3 ; VI-NEXT: v_mov_b32_e32 v3, 4 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3724,15 +3690,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 ; CI-NEXT: v_mov_b32_e32 v2, 3 ; CI-NEXT: v_mov_b32_e32 v3, 4 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3746,15 +3712,15 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; 
@@ -3762,12 +3728,11 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3776,17 +3741,17 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 ; HSA-NEXT: v_mov_b32_e32 v2, 3 ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>) @@ -3804,6 +3769,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, 
external_void_func_v5i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -3811,9 +3779,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; VI-NEXT: v_mov_b32_e32 v3, 4 ; VI-NEXT: v_mov_b32_e32 v4, 5 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3827,6 +3792,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -3834,9 +3802,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; CI-NEXT: v_mov_b32_e32 v3, 4 ; CI-NEXT: v_mov_b32_e32 v4, 5 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3850,6 +3815,9 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 @@ -3857,9 +3825,6 @@ define amdgpu_kernel void 
@test_call_external_void_func_v5i32_imm() #0 { ; GFX9-NEXT: v_mov_b32_e32 v3, 4 ; GFX9-NEXT: v_mov_b32_e32 v4, 5 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -3868,12 +3833,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_mov_b32_e32 v4, 5 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v5i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v5i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -3882,8 +3846,11 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v5i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 @@ -3891,9 +3858,6 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v3, 4 ; HSA-NEXT: v_mov_b32_e32 v4, 5 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v5i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, 
external_void_func_v5i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>) @@ -3917,11 +3881,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -3941,11 +3905,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -3965,11 +3929,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -3978,10 +3942,10 
@@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 @@ -3998,16 +3962,16 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 { ; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; HSA-NEXT: s_mov_b32 s11, 0x1100f000 ; HSA-NEXT: s_mov_b32 s10, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef @@ -4027,6 +3991,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: v_mov_b32_e32 v0, 1 ; VI-NEXT: v_mov_b32_e32 v1, 2 @@ -4037,9 +4004,6 @@ define amdgpu_kernel void 
@test_call_external_void_func_v8i32_imm() #0 { ; VI-NEXT: v_mov_b32_e32 v6, 7 ; VI-NEXT: v_mov_b32_e32 v7, 8 ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -4053,6 +4017,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: v_mov_b32_e32 v0, 1 ; CI-NEXT: v_mov_b32_e32 v1, 2 @@ -4063,9 +4030,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; CI-NEXT: v_mov_b32_e32 v6, 7 ; CI-NEXT: v_mov_b32_e32 v7, 8 ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -4079,6 +4043,9 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 @@ -4089,9 +4056,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GFX9-NEXT: v_mov_b32_e32 v6, 7 ; GFX9-NEXT: v_mov_b32_e32 v7, 8 ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, 
external_void_func_v8i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -4101,12 +4065,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -4115,8 +4078,11 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: s_add_i32 s6, s6, s9 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8 ; HSA-NEXT: s_add_u32 s0, s0, s9 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 1 ; HSA-NEXT: v_mov_b32_e32 v1, 2 @@ -4127,9 +4093,6 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 { ; HSA-NEXT: v_mov_b32_e32 v6, 7 ; HSA-NEXT: v_mov_b32_e32 v7, 8 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>) 
@@ -4155,11 +4118,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -4181,11 +4144,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; CI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -4207,11 +4170,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -4220,10 +4183,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; 
GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 @@ -4242,18 +4205,18 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 { ; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; HSA-NEXT: s_mov_b32 s11, 0x1100f000 ; HSA-NEXT: s_mov_b32 s10, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 ; HSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16 ; HSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32 ; HSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr = load ptr addrspace(1), ptr addrspace(4) undef @@ -4286,10 +4249,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_getpc_b64 s[8:9] ; VI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(7) ; VI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; VI-NEXT: s_swappc_b64 
s[30:31], s[8:9] @@ -4318,10 +4281,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_getpc_b64 s[8:9] ; CI-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(7) ; CI-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; CI-NEXT: s_swappc_b64 s[30:31], s[8:9] @@ -4350,10 +4313,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_getpc_b64 s[8:9] ; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_waitcnt vmcnt(7) ; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9] @@ -4392,7 +4355,7 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; HSA-NEXT: s_mov_b32 s11, 0x1100f000 ; HSA-NEXT: s_mov_b32 s10, -1 -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112 ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 @@ -4402,12 +4365,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_addc_u32 s1, s1, 0 
-; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_getpc_b64 s[12:13] ; HSA-NEXT: s_add_u32 s12, s12, external_void_func_v32i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s13, s13, external_void_func_v32i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_waitcnt vmcnt(7) ; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 ; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13] @@ -4443,10 +4406,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_mov_b32 s32, 0 -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(8) ; VI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; VI-NEXT: s_waitcnt vmcnt(8) @@ -4478,10 +4441,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_mov_b32 s32, 0 -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(8) ; CI-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; CI-NEXT: s_waitcnt vmcnt(8) @@ -4513,10 +4476,10 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_mov_b32 s32, 0 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; 
GFX9-NEXT: s_waitcnt vmcnt(8) ; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(8) @@ -4572,12 +4535,12 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 { ; HSA-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64 ; HSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80 ; HSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96 -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v32i32_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32_i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_waitcnt vmcnt(8) ; HSA-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; HSA-NEXT: s_waitcnt vmcnt(8) @@ -4603,14 +4566,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; VI-NEXT: s_addc_u32 s41, s41, 0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[40:41] +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[42:43] ; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_mov_b32 s39, 0xf000 ; VI-NEXT: s_mov_b32 s38, -1 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -4627,14 +4590,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; CI-NEXT: s_addc_u32 s41, s41, 0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[40:41] +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, 
external_i32_func_i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[42:43] ; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_mov_b32 s39, 0xf000 ; CI-NEXT: s_mov_b32 s38, -1 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; CI-NEXT: s_waitcnt vmcnt(0) @@ -4651,14 +4614,14 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; GFX9-NEXT: s_addc_u32 s41, s41, 0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41] +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43] ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_mov_b32 s39, 0xf000 ; GFX9-NEXT: s_mov_b32 s38, -1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -4668,14 +4631,13 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[36:37], s[2:3], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 +; GFX11-NEXT: s_getpc_b64 s[2:3] +; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32@rel32@hi+12 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_mov_b32 s39, 0x31016000 ; GFX11-NEXT: s_mov_b32 s38, -1 -; GFX11-NEXT: s_getpc_b64 s[2:3] -; GFX11-NEXT: s_add_u32 s2, s2, external_i32_func_i32@rel32@lo+4 
-; GFX11-NEXT: s_addc_u32 s3, s3, external_i32_func_i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4684,19 +4646,19 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) ; HSA-LABEL: test_call_external_i32_func_i32_imm: ; HSA: ; %bb.0: ; HSA-NEXT: s_add_i32 s8, s8, s11 -; HSA-NEXT: s_load_dwordx2 s[36:37], s[6:7], 0x0 ; HSA-NEXT: s_lshr_b32 flat_scratch_hi, s8, 8 +; HSA-NEXT: s_load_dwordx2 s[36:37], s[6:7], 0x0 ; HSA-NEXT: s_add_u32 s0, s0, s11 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 +; HSA-NEXT: s_getpc_b64 s[8:9] +; HSA-NEXT: s_add_u32 s8, s8, external_i32_func_i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s9, s9, external_i32_func_i32@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, 42 ; HSA-NEXT: s_mov_b32 s32, 0 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_mov_b32 s39, 0x1100f000 ; HSA-NEXT: s_mov_b32 s38, -1 -; HSA-NEXT: s_getpc_b64 s[8:9] -; HSA-NEXT: s_add_u32 s8, s8, external_i32_func_i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s9, s9, external_i32_func_i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: buffer_store_dword v0, off, s[36:39], 0 ; HSA-NEXT: s_waitcnt vmcnt(0) @@ -4723,11 +4685,11 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; VI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -4747,11 
+4709,11 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CI-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -4771,11 +4733,11 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -4784,10 +4746,10 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: buffer_load_u8 v0, off, s[4:7], 0 @@ -4804,16 +4766,16 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; HSA-NEXT: s_mov_b32 s11, 0x1100f000 ; 
HSA-NEXT: s_mov_b32 s10, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 +; HSA-NEXT: s_addc_u32 s1, s1, 0 ; HSA-NEXT: s_waitcnt lgkmcnt(0) ; HSA-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 ; HSA-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:4 -; HSA-NEXT: s_addc_u32 s1, s1, 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9] ; HSA-NEXT: s_endpgm %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef @@ -4840,10 +4802,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] ; VI-NEXT: s_movk_i32 s32, 0x400 -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -4868,10 +4830,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] ; CI-NEXT: s_movk_i32 s32, 0x400 -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 ; CI-NEXT: s_waitcnt vmcnt(1) @@ -4897,10 +4859,10 @@ define 
amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX9-NEXT: s_movk_i32 s32, 0x400 -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -4912,10 +4874,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 ; GFX11-NEXT: s_mov_b32 s32, 16 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, off ; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4 @@ -4939,10 +4901,10 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; HSA-NEXT: s_movk_i32 s32, 0x400 ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_waitcnt vmcnt(1) ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; HSA-NEXT: s_waitcnt vmcnt(1) @@ -4976,10 +4938,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; VI-NEXT: s_movk_i32 s32, 0x800 ; VI-NEXT: s_mov_b64 s[6:7], 
s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 ; VI-NEXT: s_waitcnt vmcnt(1) @@ -5014,10 +4976,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; CI-NEXT: s_movk_i32 s32, 0x800 ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] ; CI-NEXT: s_waitcnt vmcnt(1) ; CI-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 ; CI-NEXT: s_waitcnt vmcnt(1) @@ -5053,10 +5015,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GFX9-NEXT: s_movk_i32 s32, 0x800 ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(1) @@ -5078,10 +5040,10 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; GFX11: ; %bb.0: ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 ; GFX11-NEXT: s_mov_b32 s32, 32 -; 
GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, off ; GFX11-NEXT: scratch_store_b32 off, v1, off offset:4 @@ -5118,11 +5080,11 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval ; HSA-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; HSA-NEXT: buffer_load_dword v1, off, s[0:3], 0 ; HSA-NEXT: s_movk_i32 s32, 0x800 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_mov_b32 flat_scratch_lo, s9 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: s_waitcnt vmcnt(1) ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; HSA-NEXT: s_waitcnt vmcnt(1) @@ -5172,11 +5134,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_addc_u32 s37, s37, 0 ; VI-NEXT: s_mov_b64 s[0:1], s[36:37] -; VI-NEXT: s_mov_b64 s[2:3], s[38:39] -; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; VI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; VI-NEXT: s_mov_b64 s[2:3], s[38:39] +; VI-NEXT: s_mov_b32 s32, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; VI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -5214,11 +5176,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; CI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_addc_u32 s37, s37, 0 ; CI-NEXT: s_mov_b64 s[0:1], 
s[36:37] -; CI-NEXT: s_mov_b64 s[2:3], s[38:39] -; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; CI-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; CI-NEXT: s_mov_b64 s[2:3], s[38:39] +; CI-NEXT: s_mov_b32 s32, 0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; CI-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -5256,11 +5218,11 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_addc_u32 s37, s37, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37] -; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] -; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12 +; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-NEXT: s_mov_b32 s32, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -5288,10 +5250,10 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12 +; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[4:7], 0 ; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] @@ -5323,15 +5285,15 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 { ; HSA-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; HSA-NEXT: s_mov_b32 s11, 0x1100f000 ; HSA-NEXT: s_mov_b32 s10, -1 -; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_addc_u32 s1, s1, 0 +; HSA-NEXT: s_mov_b32 flat_scratch_lo, s7 ; HSA-NEXT: s_waitcnt 
lgkmcnt(0) ; HSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 -; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] -; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_getpc_b64 s[8:9] ; HSA-NEXT: s_add_u32 s8, s8, external_void_func_v16i8@rel32@lo+4 ; HSA-NEXT: s_addc_u32 s9, s9, external_void_func_v16i8@rel32@hi+12 +; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] +; HSA-NEXT: s_mov_b32 s32, 0 ; HSA-NEXT: s_waitcnt vmcnt(0) ; HSA-NEXT: v_lshrrev_b32_e32 v16, 8, v0 ; HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0 @@ -5381,6 +5343,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; VI-NEXT: s_mov_b64 s[6:7], s[0:1] ; VI-NEXT: s_mov_b64 s[0:1], s[52:53] ; VI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 ; VI-NEXT: s_mov_b64 s[2:3], s[54:55] ; VI-NEXT: v_mov_b32_e32 v0, s36 ; VI-NEXT: v_mov_b32_e32 v1, s37 @@ -5413,9 +5378,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; VI-NEXT: v_mov_b32_e32 v28, s20 ; VI-NEXT: v_mov_b32_e32 v29, s21 ; VI-NEXT: v_mov_b32_e32 v30, s22 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: s_endpgm ; @@ -5440,6 +5402,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; CI-NEXT: s_mov_b64 s[6:7], s[0:1] ; CI-NEXT: s_mov_b64 s[0:1], s[52:53] ; CI-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 ; CI-NEXT: s_mov_b64 s[2:3], s[54:55] ; CI-NEXT: v_mov_b32_e32 v0, s36 ; CI-NEXT: v_mov_b32_e32 v1, s37 @@ -5472,9 +5437,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x 
i32> %val ; CI-NEXT: v_mov_b32_e32 v28, s20 ; CI-NEXT: v_mov_b32_e32 v29, s21 ; CI-NEXT: v_mov_b32_e32 v30, s22 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: s_endpgm ; @@ -5499,6 +5461,9 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53] ; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 ; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55] ; GFX9-NEXT: v_mov_b32_e32 v0, s36 ; GFX9-NEXT: v_mov_b32_e32 v1, s37 @@ -5531,9 +5496,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX9-NEXT: v_mov_b32_e32 v28, s20 ; GFX9-NEXT: v_mov_b32_e32 v29, s21 ; GFX9-NEXT: v_mov_b32_e32 v30, s22 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -5569,11 +5531,10 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; GFX11-NEXT: v_dual_mov_b32 v27, s15 :: v_dual_mov_b32 v26, s14 ; GFX11-NEXT: v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16 ; GFX11-NEXT: v_mov_b32_e32 v30, s18 -; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_getpc_b64 s[2:3] ; GFX11-NEXT: s_add_u32 s2, s2, stack_passed_f64_arg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s3, s3, stack_passed_f64_arg@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b64 s[6:7], s[0:1] ; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3] ; GFX11-NEXT: s_endpgm ; @@ -5595,6 +5556,9 @@ define amdgpu_kernel void 
@stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; HSA-NEXT: v_mov_b32_e32 v0, s25 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 +; HSA-NEXT: s_getpc_b64 s[24:25] +; HSA-NEXT: s_add_u32 s24, s24, stack_passed_f64_arg@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s25, s25, stack_passed_f64_arg@rel32@hi+12 ; HSA-NEXT: s_mov_b64 s[6:7], s[4:5] ; HSA-NEXT: v_mov_b32_e32 v0, s36 ; HSA-NEXT: v_mov_b32_e32 v1, s37 @@ -5627,9 +5591,6 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val ; HSA-NEXT: v_mov_b32_e32 v28, s20 ; HSA-NEXT: v_mov_b32_e32 v29, s21 ; HSA-NEXT: v_mov_b32_e32 v30, s22 -; HSA-NEXT: s_getpc_b64 s[24:25] -; HSA-NEXT: s_add_u32 s24, s24, stack_passed_f64_arg@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s25, s25, stack_passed_f64_arg@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[24:25] ; HSA-NEXT: s_endpgm entry: @@ -5835,6 +5796,9 @@ define void @stack_12xv3i32() #0 { ; VI-NEXT: v_mov_b32_e32 v0, 15 ; VI-NEXT: v_writelane_b32 v40, s30, 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -5867,9 +5831,6 @@ define void @stack_12xv3i32() #0 { ; VI-NEXT: v_mov_b32_e32 v29, 9 ; VI-NEXT: v_mov_b32_e32 v30, 10 ; VI-NEXT: v_writelane_b32 v40, s31, 1 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 @@ -5903,6 +5864,9 @@ define void @stack_12xv3i32() #0 { ; CI-NEXT: v_mov_b32_e32 v0, 15 ; CI-NEXT: v_writelane_b32 v40, s30, 0 ; CI-NEXT: buffer_store_dword v0, off, 
s[0:3], s32 offset:16 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -5935,9 +5899,6 @@ define void @stack_12xv3i32() #0 { ; CI-NEXT: v_mov_b32_e32 v29, 9 ; CI-NEXT: v_mov_b32_e32 v30, 10 ; CI-NEXT: v_writelane_b32 v40, s31, 1 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 @@ -5971,6 +5932,9 @@ define void @stack_12xv3i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -6003,9 +5967,6 @@ define void @stack_12xv3i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v29, 9 ; GFX9-NEXT: v_mov_b32_e32 v30, 10 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -6051,12 +6012,12 @@ define void @stack_12xv3i32() #0 { ; GFX11-NEXT: v_dual_mov_b32 v27, 9 :: v_dual_mov_b32 v26, 8 ; GFX11-NEXT: v_dual_mov_b32 v29, 9 :: v_dual_mov_b32 v28, 9 ; GFX11-NEXT: v_mov_b32_e32 v30, 10 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, 
external_void_func_12xv3i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -6089,6 +6050,9 @@ define void @stack_12xv3i32() #0 { ; HSA-NEXT: v_mov_b32_e32 v0, 15 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -6121,9 +6085,6 @@ define void @stack_12xv3i32() #0 { ; HSA-NEXT: v_mov_b32_e32 v29, 9 ; HSA-NEXT: v_mov_b32_e32 v30, 10 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 @@ -6174,6 +6135,9 @@ define void @stack_12xv3f32() #0 { ; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; VI-NEXT: v_writelane_b32 v40, s30, 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -6206,9 +6170,6 @@ define void @stack_12xv3f32() #0 { ; VI-NEXT: v_mov_b32_e32 v29, 0x41100000 ; VI-NEXT: v_mov_b32_e32 v30, 0x41200000 ; VI-NEXT: v_writelane_b32 v40, s31, 1 -; VI-NEXT: s_getpc_b64 s[4:5] 
-; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 @@ -6242,6 +6203,9 @@ define void @stack_12xv3f32() #0 { ; CI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; CI-NEXT: v_writelane_b32 v40, s30, 0 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -6274,9 +6238,6 @@ define void @stack_12xv3f32() #0 { ; CI-NEXT: v_mov_b32_e32 v29, 0x41100000 ; CI-NEXT: v_mov_b32_e32 v30, 0x41200000 ; CI-NEXT: v_writelane_b32 v40, s31, 1 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 @@ -6310,6 +6271,9 @@ define void @stack_12xv3f32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -6342,9 +6306,6 @@ define void @stack_12xv3f32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0x41100000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, 
external_void_func_12xv3f32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -6394,12 +6355,12 @@ define void @stack_12xv3f32() #0 { ; GFX11-NEXT: v_dual_mov_b32 v27, 0x41100000 :: v_dual_mov_b32 v28, 0x41100000 ; GFX11-NEXT: v_mov_b32_e32 v29, 0x41100000 ; GFX11-NEXT: v_mov_b32_e32 v30, 0x41200000 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_12xv3f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -6432,6 +6393,9 @@ define void @stack_12xv3f32() #0 { ; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -6464,9 +6428,6 @@ define void @stack_12xv3f32() #0 { ; HSA-NEXT: v_mov_b32_e32 v29, 0x41100000 ; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 @@ -6525,6 +6486,9 @@ define void @stack_8xv5i32() #0 { ; VI-NEXT: v_mov_b32_e32 v0, 15 ; VI-NEXT: v_writelane_b32 v40, s30, 0 ; VI-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:32 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -6557,9 +6521,6 @@ define void @stack_8xv5i32() #0 { ; VI-NEXT: v_mov_b32_e32 v29, 5 ; VI-NEXT: v_mov_b32_e32 v30, 6 ; VI-NEXT: v_writelane_b32 v40, s31, 1 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 @@ -6601,6 +6562,9 @@ define void @stack_8xv5i32() #0 { ; CI-NEXT: v_mov_b32_e32 v0, 15 ; CI-NEXT: v_writelane_b32 v40, s30, 0 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -6633,9 +6597,6 @@ define void @stack_8xv5i32() #0 { ; CI-NEXT: v_mov_b32_e32 v29, 5 ; CI-NEXT: v_mov_b32_e32 v30, 6 ; CI-NEXT: v_writelane_b32 v40, s31, 1 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 @@ -6677,6 +6638,9 @@ define void @stack_8xv5i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, 
external_void_func_8xv5i32@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -6709,9 +6673,6 @@ define void @stack_8xv5i32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v29, 5 ; GFX9-NEXT: v_mov_b32_e32 v30, 6 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -6762,12 +6723,12 @@ define void @stack_8xv5i32() #0 { ; GFX11-NEXT: v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v29, 5 ; GFX11-NEXT: v_mov_b32_e32 v28, 5 ; GFX11-NEXT: v_mov_b32_e32 v30, 6 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -6808,6 +6769,9 @@ define void @stack_8xv5i32() #0 { ; HSA-NEXT: v_mov_b32_e32 v0, 15 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -6840,9 +6804,6 @@ define void @stack_8xv5i32() #0 { ; HSA-NEXT: v_mov_b32_e32 v29, 5 ; HSA-NEXT: v_mov_b32_e32 v30, 6 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, 
external_void_func_8xv5i32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 @@ -6897,6 +6858,9 @@ define void @stack_8xv5f32() #0 { ; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; VI-NEXT: v_writelane_b32 v40, s30, 0 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; VI-NEXT: s_getpc_b64 s[4:5] +; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_mov_b32_e32 v2, 0 @@ -6929,9 +6893,6 @@ define void @stack_8xv5f32() #0 { ; VI-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; VI-NEXT: v_writelane_b32 v40, s31, 1 -; VI-NEXT: s_getpc_b64 s[4:5] -; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 -; VI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 @@ -6973,6 +6934,9 @@ define void @stack_8xv5f32() #0 { ; CI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; CI-NEXT: v_writelane_b32 v40, s30, 0 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; CI-NEXT: s_getpc_b64 s[4:5] +; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 ; CI-NEXT: v_mov_b32_e32 v0, 0 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_mov_b32_e32 v2, 0 @@ -7005,9 +6969,6 @@ define void @stack_8xv5f32() #0 { ; CI-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; CI-NEXT: v_writelane_b32 v40, s31, 1 -; CI-NEXT: s_getpc_b64 s[4:5] -; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 -; CI-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; 
CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 @@ -7049,6 +7010,9 @@ define void @stack_8xv5f32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -7081,9 +7045,6 @@ define void @stack_8xv5f32() #0 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 -; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 @@ -7137,12 +7098,12 @@ define void @stack_8xv5f32() #0 { ; GFX11-NEXT: v_dual_mov_b32 v27, 0x40a00000 :: v_dual_mov_b32 v26, 0x40a00000 ; GFX11-NEXT: v_dual_mov_b32 v29, 0x40a00000 :: v_dual_mov_b32 v28, 0x40a00000 ; GFX11-NEXT: v_mov_b32_e32 v30, 0x40c00000 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_mov_b32 s32, s33 @@ -7183,6 +7144,9 @@ define void @stack_8xv5f32() #0 { ; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:32 +; HSA-NEXT: s_getpc_b64 s[4:5] +; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 +; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 ; HSA-NEXT: v_mov_b32_e32 v0, 0 ; HSA-NEXT: v_mov_b32_e32 v1, 0 ; HSA-NEXT: v_mov_b32_e32 v2, 0 @@ -7215,9 +7179,6 @@ define void @stack_8xv5f32() #0 { ; HSA-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 -; HSA-NEXT: s_getpc_b64 s[4:5] -; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 -; HSA-NEXT: s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index c7f9ff8..ff80e05 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -102,10 +102,10 @@ define hidden void @void_func_void_clobber_vcc() #2 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc: -; GCN: s_mov_b64 s[34:35], vcc -; GCN-NEXT: s_getpc_b64 +; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 +; GCN: s_mov_b64 s[34:35], vcc ; GCN-NEXT: s_swappc_b64 ; GCN: s_mov_b64 vcc, s[34:35] define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 { @@ -142,21 +142,27 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace ; FIXME: What is the expected behavior for reserved registers here? 
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; GCN: #ASMSTART -; GCN-NEXT: ; def s33 -; GCN-NEXT: #ASMEND ; FLATSCR: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; MUBUF: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 -; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] + +; GCN: #ASMSTART +; GCN-NEXT: ; def s33 +; GCN-NEXT: #ASMEND + +; GCN-NOT: s33 + +; FLATSCR: s_swappc_b64 s[30:31], s[0:1] +; MUBUF: s_swappc_b64 s[30:31], s[4:5] + +; GCN-NOT: s33 + ; GCN: ;;#ASMSTART ; GCN-NEXT: ; use s33 ; GCN-NEXT: ;;#ASMEND -; GCN-NOT: s33 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 { %s33 = call i32 asm sideeffect "; def $0", "={s33}"() @@ -168,20 +174,20 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace( ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} ; GCN-NOT: s34 -; GCN: s_mov_b32 s32, 0 - -; GCN-NOT: s34 -; GCN: ;;#ASMSTART -; GCN-NEXT: ; def s34 -; GCN-NEXT: ;;#ASMEND ; FLATSCR: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; MUBUF: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; GCN: s_mov_b32 s32, 0 + +; GCN: ;;#ASMSTART +; GCN-NEXT: ; def s34 +; GCN-NEXT: ;;#ASMEND ; GCN-NOT: s34 + ; MUBUF: s_swappc_b64 s[30:31], s[4:5] ; FLATSCR: s_swappc_b64 s[30:31], s[0:1] @@ -200,19 +206,19 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(ptr addrspace( ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} -; GCN-NOT: v32 -; GCN: s_mov_b32 
s32, 0 -; GCN-NOT: v40 - -; GCN: ;;#ASMSTART -; GCN-NEXT: ; def v40 -; GCN-NEXT: ;;#ASMEND ; MUBUF: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 ; FLATSCR: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; GCN: s_mov_b32 s32, 0 + +; GCN: ;;#ASMSTART +; GCN-NEXT: ; def v40 +; GCN-NEXT: ;;#ASMEND + +; GCN-NOT: v40 ; MUBUF: s_swappc_b64 s[30:31], s[4:5] ; FLATSCR: s_swappc_b64 s[30:31], s[0:1] @@ -255,10 +261,10 @@ define hidden void @void_func_void_clobber_s34() #2 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: -; GCN: s_mov_b32 s32, 0 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { @@ -267,10 +273,10 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: -; GCN: s_mov_b32 s32, 0 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll index 3626b2b..093ca55 100644 --- a/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/call-reqd-group-size.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck %s -check-prefix=GISEL ; Check for optimizing the 
passed implicit workitem ID based on the ; required group size. This should avoid a few bit packing operations. @@ -13,15 +13,30 @@ define amdgpu_kernel void @known_x_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 -; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 -; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 20, v2 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm +; +; GISEL-LABEL: known_x_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_lshlrev_b32_e32 v0, 20, v2 +; GISEL-NEXT: v_lshl_or_b32 v31, v1, 10, v0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @callee() ret void } @@ -34,13 +49,27 @@ define amdgpu_kernel void @known_y_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 -; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; CHECK-NEXT: v_lshl_or_b32 v31, v2, 20, v0 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm +; +; 
GISEL-LABEL: known_y_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_lshl_or_b32 v31, v2, 20, v0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @callee() ret void } @@ -53,13 +82,27 @@ define amdgpu_kernel void @known_z_0(ptr addrspace(1) %out) !reqd_work_group_siz ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 -; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; CHECK-NEXT: v_lshl_or_b32 v31, v1, 10, v0 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm +; +; GISEL-LABEL: known_z_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_lshl_or_b32 v31, v1, 10, v0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @callee() ret void } @@ -72,13 +115,27 @@ define amdgpu_kernel void @known_yz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_mov_b32_e32 v31, v0 -; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, 
s4, callee@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v31, v0 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm +; +; GISEL-LABEL: known_yz_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v31, v0 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @callee() ret void } @@ -91,13 +148,27 @@ define amdgpu_kernel void @known_xz_0(ptr addrspace(1) %out) !reqd_work_group_si ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 -; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v31, 10, v1 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm +; +; GISEL-LABEL: known_xz_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: v_lshlrev_b32_e32 v31, 10, v1 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @callee() ret void } @@ -111,13 +182,27 @@ define amdgpu_kernel void @known_xyz_0(ptr addrspace(1) %out) !reqd_work_group_s ; CHECK-NEXT: s_addc_u32 
flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: v_mov_b32_e32 v31, 0 -; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v31, 0 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CHECK-NEXT: s_endpgm +; +; GISEL-LABEL: known_xyz_0: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_add_u32 flat_scratch_lo, s12, s17 +; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 +; GISEL-NEXT: s_add_u32 s0, s0, s17 +; GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GISEL-NEXT: s_getpc_b64 s[4:5] +; GISEL-NEXT: s_add_u32 s4, s4, callee@rel32@lo+4 +; GISEL-NEXT: s_addc_u32 s5, s5, callee@rel32@hi+12 +; GISEL-NEXT: v_mov_b32_e32 v31, 0 +; GISEL-NEXT: s_mov_b32 s32, 0 +; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GISEL-NEXT: s_endpgm call void @callee() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 10f0efe..1515240 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -13,11 +13,11 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: ds_read_b32 v0, v0 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[8:9] ; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm %vgpr = load volatile i32, ptr addrspace(3) %ptr @@ -33,16 +33,16 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 { ; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 ; GCN-NEXT: s_add_u32 s0, s0, s11 -; GCN-NEXT: 
v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_addc_u32 s1, s1, 0 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_getpc_b64 s[8:9] +; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_store_dword v0, v0, s[6:7] ; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 s32, 0 -; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: s_endpgm store i32 0, ptr addrspace(1) %ptr @@ -55,16 +55,16 @@ define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) # ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 ; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[8:9] ; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: global_store_dword v40, v40, s[34:35] @@ -78,16 +78,16 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) % ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11 -; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 +; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0 ; GCN-NEXT: s_add_u32 s0, s0, s11 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: 
s_mov_b32 s32, 0 ; GCN-NEXT: s_getpc_b64 s[8:9] ; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9] ; GCN-NEXT: global_store_dword v40, v0, s[34:35] diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 3241a76..4c6f2d2 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -125,12 +125,12 @@ define void @callee_with_stack_and_call() #0 { ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 -; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_getpc_b64 s[16:17] ; MUBUF-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 @@ -155,12 +155,12 @@ define void @callee_with_stack_and_call() #0 { ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 -; FLATSCR-NEXT: scratch_store_dword off, v0, s33 -; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) 
; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 @@ -196,10 +196,10 @@ define void @callee_no_stack_with_call() #0 { ; MUBUF-NEXT: v_writelane_b32 v40, s16, 2 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 -; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_getpc_b64 s[16:17] ; MUBUF-NEXT: s_add_u32 s16, s16, external_void_func_void@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s17, s17, external_void_func_void@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 @@ -223,10 +223,10 @@ define void @callee_no_stack_with_call() #0 { ; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 -; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 @@ -1595,12 +1595,12 @@ define void @ipra_call_with_stack() #0 { ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_writelane_b32 v1, s30, 0 ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 -; MUBUF-NEXT: v_writelane_b32 v1, s31, 1 -; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 -; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_getpc_b64 s[16:17] ; MUBUF-NEXT: s_add_u32 s16, s16, local_empty_func@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s17, s17, local_empty_func@rel32@hi+12 +; MUBUF-NEXT: v_writelane_b32 v1, s31, 1 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[16:17] ; MUBUF-NEXT: v_readlane_b32 s31, v1, 1 ; MUBUF-NEXT: 
v_readlane_b32 s30, v1, 0 @@ -1623,12 +1623,12 @@ define void @ipra_call_with_stack() #0 { ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_writelane_b32 v1, s30, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 -; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1 -; FLATSCR-NEXT: scratch_store_dword off, v0, s33 -; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_getpc_b64 s[0:1] ; FLATSCR-NEXT: s_add_u32 s0, s0, local_empty_func@rel32@lo+4 ; FLATSCR-NEXT: s_addc_u32 s1, s1, local_empty_func@rel32@hi+12 +; FLATSCR-NEXT: v_writelane_b32 v1, s31, 1 +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v1, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 6094253..5d4db90 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -198,11 +198,12 @@ define hidden void @use_workgroup_id_yz() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x: ; GCN-NOT: s6 -; GCN: s_mov_b32 s12, s6 -; GCN: s_mov_b32 s32, 0 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12 +; GCN-NOT: s6 +; GCN: s_mov_b32 s12, s6 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll index c511f88..fc24041 100644 --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -69,20 +69,20 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s12, s12, s17 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX803-NEXT: 
s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b32 s13, s15 ; GFX803-NEXT: s_mov_b32 s12, s14 -; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 -; GFX803-NEXT: s_mov_b32 s14, s16 -; GFX803-NEXT: s_mov_b32 s32, 0 ; GFX803-NEXT: s_getpc_b64 s[18:19] ; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 ; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 +; GFX803-NEXT: s_mov_b32 s14, s16 +; GFX803-NEXT: s_mov_b32 s32, 0 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; @@ -91,17 +91,17 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 -; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX900-NEXT: s_mov_b32 s14, s16 -; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_getpc_b64 s[18:19] ; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 ; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX900-NEXT: s_mov_b32 s14, s16 +; GFX900-NEXT: s_mov_b32 s32, 0 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; @@ -119,10 +119,10 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX1010-NEXT: s_mov_b32 s13, s15 ; GFX1010-NEXT: s_mov_b32 s12, s14 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_mov_b32 s14, s16 ; GFX1010-NEXT: s_getpc_b64 s[18:19] ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 ; GFX1010-NEXT: 
s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_mov_b32 s14, s16 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm ; @@ -132,14 +132,14 @@ define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { ; GFX1100-NEXT: s_mov_b32 s12, s13 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1100-NEXT: s_mov_b32 s13, s14 ; GFX1100-NEXT: s_mov_b32 s14, s15 ; GFX1100-NEXT: s_mov_b32 s32, 0 -; GFX1100-NEXT: s_getpc_b64 s[16:17] -; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1100-NEXT: s_endpgm @@ -153,23 +153,23 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s12, s12, s17 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b32 s13, s15 ; GFX803-NEXT: s_mov_b32 s12, s14 ; GFX803-NEXT: v_mov_b32_e32 v3, 0 +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_movk_i32 s32, 0x400 ; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_getpc_b64 s[18:19] -; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 
s19, s19, ex@rel32@hi+12 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; @@ -178,20 +178,20 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_movk_i32 s32, 0x400 ; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_getpc_b64 s[18:19] -; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; @@ -210,12 +210,12 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX1010-NEXT: s_mov_b32 s13, s15 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b32 s12, s14 -; GFX1010-NEXT: s_mov_b32 s14, s16 -; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 -; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_getpc_b64 s[18:19] ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], 0 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm ; @@ -226,6 +226,9 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX1100-NEXT: s_mov_b32 s12, s13 ; GFX1100-NEXT: 
s_mov_b64 s[10:11], s[6:7] ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1100-NEXT: s_mov_b32 s13, s14 @@ -233,9 +236,6 @@ define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { ; GFX1100-NEXT: s_mov_b32 s32, 16 ; GFX1100-NEXT: scratch_store_b32 off, v1, off dlc ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: s_getpc_b64 s[16:17] -; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1100-NEXT: s_endpgm @@ -320,21 +320,21 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s12, s12, s17 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b32 s13, s15 ; GFX803-NEXT: s_mov_b32 s12, s14 +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_mov_b32 s32, 0 -; GFX803-NEXT: s_getpc_b64 s[18:19] -; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; @@ -343,18 +343,18 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX900-NEXT: s_add_u32 
flat_scratch_lo, s12, s17 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_mov_b32 s33, 0 ; GFX900-NEXT: s_mov_b32 s32, 0 -; GFX900-NEXT: s_getpc_b64 s[18:19] -; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; @@ -373,10 +373,10 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX1010-NEXT: s_mov_b32 s13, s15 ; GFX1010-NEXT: s_mov_b32 s12, s14 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 -; GFX1010-NEXT: s_mov_b32 s14, s16 ; GFX1010-NEXT: s_getpc_b64 s[18:19] ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_mov_b32 s14, s16 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm ; @@ -386,15 +386,15 @@ define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { ; GFX1100-NEXT: s_mov_b32 s12, s13 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1100-NEXT: s_mov_b32 s13, s14 ; GFX1100-NEXT: s_mov_b32 s14, s15 ; GFX1100-NEXT: s_mov_b32 s33, 0 ; GFX1100-NEXT: s_mov_b32 s32, 0 -; GFX1100-NEXT: s_getpc_b64 s[16:17] -; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; 
GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1100-NEXT: s_endpgm ; GFX1010-NEXT s_add_u32 s12, s12, s17 @@ -426,24 +426,24 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_add_i32 s12, s12, s17 ; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX803-NEXT: s_add_u32 s0, s0, s17 +; GFX803-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX803-NEXT: s_mov_b32 s33, 0 ; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GFX803-NEXT: s_addc_u32 s1, s1, 0 ; GFX803-NEXT: s_mov_b32 s13, s15 ; GFX803-NEXT: s_mov_b32 s12, s14 ; GFX803-NEXT: v_mov_b32_e32 v3, 0 +; GFX803-NEXT: s_getpc_b64 s[18:19] +; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX803-NEXT: v_or_b32_e32 v31, v0, v2 ; GFX803-NEXT: s_mov_b32 s14, s16 ; GFX803-NEXT: s_movk_i32 s32, 0x400 ; GFX803-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; GFX803-NEXT: s_waitcnt vmcnt(0) -; GFX803-NEXT: s_getpc_b64 s[18:19] -; GFX803-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX803-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX803-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX803-NEXT: s_endpgm ; @@ -452,21 +452,21 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GFX900-NEXT: s_add_u32 s0, s0, s17 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GFX900-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GFX900-NEXT: s_mov_b32 s33, 0 -; GFX900-NEXT: s_addc_u32 s1, s1, 0 ; GFX900-NEXT: s_mov_b32 s13, s15 ; GFX900-NEXT: s_mov_b32 s12, s14 ; GFX900-NEXT: v_mov_b32_e32 v3, 0 +; GFX900-NEXT: s_getpc_b64 s[18:19] +; GFX900-NEXT: s_add_u32 
s18, s18, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX900-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX900-NEXT: s_mov_b32 s14, s16 ; GFX900-NEXT: s_movk_i32 s32, 0x400 ; GFX900-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_getpc_b64 s[18:19] -; GFX900-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 -; GFX900-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 ; GFX900-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX900-NEXT: s_endpgm ; @@ -486,12 +486,12 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX1010-NEXT: s_mov_b32 s13, s15 ; GFX1010-NEXT: v_or3_b32 v31, v0, v1, v2 ; GFX1010-NEXT: s_mov_b32 s12, s14 -; GFX1010-NEXT: s_mov_b32 s14, s16 -; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 -; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_getpc_b64 s[18:19] ; GFX1010-NEXT: s_add_u32 s18, s18, ex@rel32@lo+4 ; GFX1010-NEXT: s_addc_u32 s19, s19, ex@rel32@hi+12 +; GFX1010-NEXT: s_mov_b32 s14, s16 +; GFX1010-NEXT: buffer_store_dword v3, off, s[0:3], s33 +; GFX1010-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX1010-NEXT: s_endpgm ; @@ -503,6 +503,9 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX1100-NEXT: s_mov_b32 s12, s13 ; GFX1100-NEXT: s_mov_b64 s[10:11], s[6:7] ; GFX1100-NEXT: s_mov_b64 s[8:9], s[4:5] +; GFX1100-NEXT: s_getpc_b64 s[16:17] +; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 +; GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX1100-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX1100-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX1100-NEXT: s_mov_b32 s13, s14 @@ -510,9 +513,6 @@ define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_add ; GFX1100-NEXT: s_mov_b32 s32, 16 ; GFX1100-NEXT: scratch_store_b32 off, v1, s33 dlc ; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1100-NEXT: s_getpc_b64 s[16:17] -; GFX1100-NEXT: s_add_u32 s16, s16, ex@rel32@lo+4 -; 
GFX1100-NEXT: s_addc_u32 s17, s17, ex@rel32@hi+12 ; GFX1100-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1100-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 9bef0b7..8b8e519 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -35,10 +35,10 @@ define float @call_split_type_used_outside_block_v2f32() #0 { ; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, func_v2f32@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 @@ -71,10 +71,10 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, func_v3f32@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 @@ -107,10 +107,10 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, func_v4f16@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: 
v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 @@ -143,10 +143,10 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, func_struct@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 @@ -189,16 +189,16 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-NEXT: s_cbranch_vccnz .LBB4_2 ; GCN-NEXT: ; %bb.1: ; %if.else ; GCN-NEXT: s_add_u32 s8, s8, 8 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12 ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 ; GCN-NEXT: s_mov_b32 s12, s14 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: s_getpc_b64 s[18:19] -; GCN-NEXT: s_add_u32 s18, s18, func_v3i16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s19, s19, func_v3i16@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_branch .LBB4_3 ; GCN-NEXT: .LBB4_2: @@ -240,16 +240,16 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-NEXT: s_cbranch_vccnz .LBB5_2 ; GCN-NEXT: ; %bb.1: ; %if.else ; GCN-NEXT: s_add_u32 s8, s8, 8 +; GCN-NEXT: s_addc_u32 s9, s9, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 -; GCN-NEXT: s_addc_u32 s9, s9, 0 +; GCN-NEXT: s_getpc_b64 s[18:19] +; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12 ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 ; GCN-NEXT: 
s_mov_b32 s12, s14 ; GCN-NEXT: s_mov_b32 s13, s15 ; GCN-NEXT: s_mov_b32 s14, s16 -; GCN-NEXT: s_getpc_b64 s[18:19] -; GCN-NEXT: s_add_u32 s18, s18, func_v3f16@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s19, s19, func_v3f16@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GCN-NEXT: s_branch .LBB5_3 ; GCN-NEXT: .LBB5_2: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 75eb6855..5aa9be6 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -2917,8 +2917,8 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo @@ -5754,8 +5754,8 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index a05e4a0..0655967 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -2917,8 +2917,8 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; 
GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo @@ -5754,8 +5754,8 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory__am ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v5, v1 -; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX10-NEXT: v_mov_b32_e32 v4, v0 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v5 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll index 6684262..13884eb 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -19,12 +19,12 @@ define void @callee_with_stack_and_call() #0 { ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 ; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 -; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 -; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] ; SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 +; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 @@ -62,11 +62,11 @@ define void @callee_with_stack_and_call() #0 { ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; 
NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] ; NO-SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 -; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 -; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_getpc_b64 s[4:5] ; NO-SPILL-TO-VGPR-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; NO-SPILL-TO-VGPR-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12 +; NO-SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 +; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 s[4:5], exec ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, 1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll index fbe06b3..15be44a 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -1267,16 +1267,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: @@ -1487,16 +1487,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope ; ; GFX1032-DPP-LABEL: 
global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -2487,16 +2487,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: @@ -2737,16 +2737,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; 
GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -4543,16 +4543,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: @@ -4793,16 +4793,16 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; 
GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -5987,19 +5987,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_mov_b32 s9, exec_lo +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: @@ -6446,19 +6446,19 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, 
s9, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -7692,8 +7692,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 @@ -8122,16 +8122,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: 
s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: @@ -8379,16 +8379,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -9217,8 +9217,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 @@ -9555,16 +9555,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: 
s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: @@ -9812,16 +9812,16 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -10650,8 +10650,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 @@ -11565,8 +11565,8 @@ define amdgpu_kernel void 
@global_atomic_fadd_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 @@ -13748,8 +13748,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index 7792422..a4410bb 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -3366,17 +3366,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: @@ -3806,17 +3806,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -5094,8 +5094,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 @@ -6469,8 +6469,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 @@ -6914,17 +6914,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; ; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: @@ -7354,17 +7354,17 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau ; ; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; 
GFX1032-DPP-NEXT: ; %bb.1: @@ -8642,8 +8642,8 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_max_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index cb3291d..68d7dcc 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -3366,17 +3366,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-NEXT: ; %bb.1: @@ -3806,17 +3806,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: 
s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -5094,8 +5094,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 @@ -6469,8 +6469,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_max_f64 v[5:6], v[5:6], v[5:6] ; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 @@ -6914,17 +6914,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; ; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 
s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-NEXT: ; %bb.1: @@ -7354,17 +7354,17 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau ; ; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe: ; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -8642,8 +8642,8 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_max_f64 v[10:11], v[10:11], v[10:11] ; GFX1032-DPP-NEXT: v_min_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v8 ; GFX1032-DPP-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll index 6dc3a197..7126680 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -1379,16 +1379,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-NEXT: ; %bb.1: @@ -1629,16 +1629,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: 
s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -2711,16 +2711,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-NEXT: ; %bb.1: @@ -2961,16 +2961,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_ ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, 
vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -4871,16 +4871,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-NEXT: ; %bb.1: @@ -5121,16 +5121,16 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop ; ; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -6315,19 +6315,19 @@ define amdgpu_kernel void 
@global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s42, s9 ; GFX1032-NEXT: s_mov_b32 s9, exec_lo +; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s50, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-NEXT: s_mov_b32 s46, 0 ; GFX1032-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_3 ; GFX1032-NEXT: ; %bb.1: @@ -6774,19 +6774,19 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s42, s9 ; GFX1032-DPP-NEXT: s_mov_b32 s9, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s9, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0 ; GFX1032-DPP-NEXT: s_mov_b64 s[40:41], s[0:1] ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-DPP-NEXT: 
s_cbranch_execz .LBB9_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -8020,8 +8020,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 @@ -8450,16 +8450,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-NEXT: ; %bb.1: @@ -8707,16 +8707,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a ; ; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; 
GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -9544,8 +9544,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 @@ -9882,16 +9882,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-NEXT: s_mov_b32 s14, -1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-NEXT: s_mov_b32 s2, 0 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-NEXT: ; %bb.1: @@ -10139,16 +10139,16 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent ; ; GFX1032-DPP-LABEL: 
global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp: ; GFX1032-DPP: ; %bb.0: -; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1 -; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s11 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 -; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3 ; GFX1032-DPP-NEXT: ; %bb.1: @@ -10977,8 +10977,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 @@ -11892,8 +11892,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v3, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v7, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 @@ -14074,8 +14074,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, 
v8, 0, 0 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11] ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0 -; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9 ; GFX1032-DPP-NEXT: s_mov_b32 s46, 0 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index c826980..acb706c 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -54,6 +54,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s33, s16 ; CHECK-NEXT: s_addc_u32 s45, s35, 0 ; CHECK-NEXT: s_mov_b32 s43, s14 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: s_mov_b32 s13, s15 @@ -62,14 +65,14 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7] ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v45, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v43, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: 
s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] @@ -77,13 +80,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v41, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] @@ -92,26 +95,23 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v45, v45 offset:15360 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v43 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v43 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] -; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: global_load_dword v0, v0, s[52:53] +; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 ; 
CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4 ; CHECK-NEXT: v_mov_b32_e32 v1, 12 @@ -190,6 +190,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -197,9 +200,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 @@ -215,6 +215,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -223,9 +226,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 1, v58 ; 
CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 @@ -241,6 +241,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -249,9 +252,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v60, 2, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v60 @@ -267,6 +267,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -275,9 +278,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: 
s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v58, 3, v58 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v58 @@ -319,6 +319,9 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] @@ -326,9 +329,6 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v47, 1, v47 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v57 @@ -356,15 +356,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: 
s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_mov_b32 s4, exec_lo @@ -381,15 +381,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z14get_local_sizej@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z14get_local_sizej@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41 @@ -439,16 +439,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_and_b32_e32 v0, 0xf0, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 15, v1 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 -; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_addPU3AS1Vjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_addPU3AS1Vjj@rel32@hi+12 ; CHECK-NEXT: v_or3_b32 v2, v3, v2, v4 +; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 -; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_or3_b32 v73, v2, v0, v1 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, 
_Z10atomic_addPU3AS1Vjj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_addPU3AS1Vjj@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v73 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v73 ; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffc, v0 @@ -500,15 +500,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: v_mov_b32_e32 v2, v44 ; CHECK-NEXT: s_add_u32 s8, s34, 40 ; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39] ; CHECK-NEXT: s_mov_b64 s[10:11], s[36:37] ; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_subPU3AS1Vjj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_subPU3AS1Vjj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_branch .LBB0_27 ; CHECK-NEXT: .LBB0_33: @@ -803,6 +803,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s33, s16 ; CHECK-NEXT: s_addc_u32 s45, s39, 0 ; CHECK-NEXT: s_mov_b32 s43, s14 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z13get_global_idj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: s_mov_b32 s13, s15 @@ -811,14 +814,14 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11] ; CHECK-NEXT: s_mov_b64 s[36:37], s[6:7] ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z13get_global_idj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 
s17, s17, _Z13get_global_idj@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v43, 0 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mov_b32_e32 v42, v0 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] @@ -826,13 +829,13 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z12get_local_idj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z12get_local_idj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_mul_lo_u32 v46, v0, 14 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: v_mov_b32_e32 v0, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] @@ -841,27 +844,24 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: ds_write_b32 v43, v43 offset:15360 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: v_add_nc_u32_e32 v44, 0x3c04, v46 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v42 ; CHECK-NEXT: v_lshlrev_b32_e32 v1, 2, v42 ; CHECK-NEXT: v_mov_b32_e32 v31, v40 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 
+; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: v_and_b32_e32 v0, 0x7ffffffc, v0 ; CHECK-NEXT: v_and_b32_e32 v1, 28, v1 +; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[8:9], s[44:45] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] -; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: global_load_dword v0, v0, s[46:47] +; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z3minjj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z3minjj@rel32@hi+12 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_bfe_u32 v0, v0, v1, 4 ; CHECK-NEXT: v_mov_b32_e32 v1, 12 @@ -945,6 +945,9 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mov_b32_e32 v0, 0x3c00 ; CHECK-NEXT: s_add_u32 s8, s38, 40 ; CHECK-NEXT: s_addc_u32 s9, s39, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -952,9 +955,6 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 ; CHECK-NEXT: v_add_nc_u32_e32 v43, 1, v43 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z10atomic_incPU3AS3Vj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z10atomic_incPU3AS3Vj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; CHECK-NEXT: ds_write_b32 v0, v47 @@ -982,15 +982,15 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: s_add_u32 s8, s38, 40 ; CHECK-NEXT: 
s_addc_u32 s9, s39, 0 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] ; CHECK-NEXT: s_mov_b64 s[6:7], s[36:37] ; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35] ; CHECK-NEXT: s_mov_b32 s12, s43 ; CHECK-NEXT: s_mov_b32 s13, s42 ; CHECK-NEXT: s_mov_b32 s14, s33 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, _Z7barrierj@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, _Z7barrierj@rel32@hi+12 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_endpgm .5: diff --git a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll index 6f841c8..f60786c 100644 --- a/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll +++ b/llvm/test/CodeGen/AMDGPU/set-inactive-wwm-overwrite.ll @@ -4,8 +4,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) { ; GCN-LABEL: if_then: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 ; GCN-NEXT: v_mov_b32_e32 v3, 1 @@ -60,8 +60,8 @@ define amdgpu_cs void @if_then(ptr addrspace(8) inreg %input, ptr addrspace(8) i define amdgpu_cs void @if_else_vgpr_opt(ptr addrspace(8) inreg %input, ptr addrspace(8) inreg %output, <3 x i32> %LocalInvocationId) { ; GCN-LABEL: if_else_vgpr_opt: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GCN-NEXT: v_mov_b32_e32 v3, 0 ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: ; %bb.1: ; %.bb0 ; GCN-NEXT: v_mov_b32_e32 v3, 1 diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index cd7f0c62..b57adfe 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ 
b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -236,10 +236,10 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt ; GCN-NEXT: v_writelane_b32 v40, s4, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, i32_fastcc_i32_byval_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, i32_fastcc_i32_byval_i32@rel32@hi+12 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -881,6 +881,9 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, 9 ; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 @@ -895,9 +898,6 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 { ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_multi_byval@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_multi_byval@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] entry: %alloca0 = alloca [3 x i32], align 16, addrspace(5) @@ -925,6 +925,9 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 
offset:16 +; GCN-NEXT: s_getpc_b64 s[16:17] +; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 +; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 @@ -956,9 +959,6 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64 ; GCN-NEXT: v_mov_b32_e32 v28, 0 ; GCN-NEXT: v_mov_b32_e32 v29, 0 ; GCN-NEXT: v_mov_b32_e32 v30, 0 -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, void_fastcc_byval_and_stack_passed@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, void_fastcc_byval_and_stack_passed@rel32@hi+12 ; GCN-NEXT: s_setpc_b64 s[16:17] entry: %alloca = alloca [3 x i32], align 16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index 715ea57..0501602 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1023,8 +1023,8 @@ define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 { ; ; GFX10-WAVE32-LABEL: test_kill_divergent_loop: ; GFX10-WAVE32: ; %bb.0: ; %entry -; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-WAVE32-NEXT: s_mov_b32 s0, exec_lo ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB10_3 diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index 5641c43..d4d3b37 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -21,10 +21,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 
0x400000 -; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 ; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v0, s0 ; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] @@ -85,10 +85,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; MUBUF11-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; MUBUF11-NEXT: v_mov_b32_e32 v4, 0x400000 -; MUBUF11-NEXT: s_movk_i32 s32, 0x6000 ; MUBUF11-NEXT: s_getpc_b64 s[0:1] ; MUBUF11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4 ; MUBUF11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12 +; MUBUF11-NEXT: s_movk_i32 s32, 0x6000 ; MUBUF11-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF11-NEXT: v_mov_b32_e32 v0, s2 ; MUBUF11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -112,10 +112,10 @@ define amdgpu_kernel void @kernel_background_evaluate(ptr addrspace(5) %kg, ptr ; FLATSCR11-NEXT: v_mov_b32_e32 v1, 0x2000 ; FLATSCR11-NEXT: v_dual_mov_b32 v2, 0x4000 :: v_dual_mov_b32 v3, 0 ; FLATSCR11-NEXT: v_mov_b32_e32 v4, 0x400000 -; FLATSCR11-NEXT: s_movk_i32 s32, 0x6000 ; FLATSCR11-NEXT: s_getpc_b64 s[0:1] ; FLATSCR11-NEXT: s_add_u32 s0, s0, svm_eval_nodes@rel32@lo+4 ; FLATSCR11-NEXT: s_addc_u32 s1, s1, svm_eval_nodes@rel32@hi+12 +; FLATSCR11-NEXT: s_movk_i32 s32, 0x6000 ; FLATSCR11-NEXT: s_waitcnt lgkmcnt(0) ; FLATSCR11-NEXT: v_mov_b32_e32 v0, s2 ; FLATSCR11-NEXT: s_swappc_b64 s[30:31], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/swdev373493.ll b/llvm/test/CodeGen/AMDGPU/swdev373493.ll index 4f33e19..5917522 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev373493.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev373493.ll @@ -25,17 +25,20 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a ; CHECK-NEXT: s_getpc_b64 s[18:19] ; CHECK-NEXT: s_add_u32 s18, s18, global@rel32@lo+1948 ; 
CHECK-NEXT: s_addc_u32 s19, s19, global@rel32@hi+1956 -; CHECK-NEXT: v_mov_b32_e32 v5, 0 -; CHECK-NEXT: v_mov_b32_e32 v0, s18 -; CHECK-NEXT: v_mov_b32_e32 v1, s19 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, eggs@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, eggs@rel32@hi+12 +; CHECK-NEXT: v_mov_b32_e32 v5, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s18 +; CHECK-NEXT: v_mov_b32_e32 v1, s19 ; CHECK-NEXT: s_setpc_b64 s[16:17] ; CHECK-NEXT: .LBB0_3: ; %LeafBlock1 ; CHECK-NEXT: s_cbranch_scc0 .LBB0_5 ; CHECK-NEXT: ; %bb.4: ; %bb8 ; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, quux@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, quux@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v1, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 @@ -47,9 +50,6 @@ define hidden fastcc void @bar(i32 %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %a ; CHECK-NEXT: v_mov_b32_e32 v9, v13 ; CHECK-NEXT: v_mov_b32_e32 v10, v14 ; CHECK-NEXT: v_mov_b32_e32 v11, v15 -; CHECK-NEXT: s_getpc_b64 s[16:17] -; CHECK-NEXT: s_add_u32 s16, s16, quux@rel32@lo+4 -; CHECK-NEXT: s_addc_u32 s17, s17, quux@rel32@hi+12 ; CHECK-NEXT: s_setpc_b64 s[16:17] ; CHECK-NEXT: .LBB0_5: ; %bb9 ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll index 0689c05..80dae91 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll @@ -7,10 +7,10 @@ define void @tail_call_i32_inreg_uniform(i32 inreg %sgpr) { ; CHECK-LABEL: tail_call_i32_inreg_uniform: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s0, s16 ; CHECK-NEXT: s_getpc_b64 s[18:19] ; CHECK-NEXT: s_add_u32 s18, s18, void_func_i32_inreg@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s19, s19, void_func_i32_inreg@rel32@hi+12 +; CHECK-NEXT: s_mov_b32 s0, s16 ; CHECK-NEXT: 
s_setpc_b64 s[18:19] tail call void @void_func_i32_inreg(i32 inreg %sgpr) ret void diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index 12eec4f..dd78c2f 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -290,6 +290,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.31: ; %bb7.i.i ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] +; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 +; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -297,9 +300,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] -; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 -; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: .LBB1_32: ; %Flow @@ -308,6 +308,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.33: ; %bb11.i.i ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] +; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 +; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -315,9 +318,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) 
%arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] -; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 -; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS1-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock ; @@ -582,6 +582,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.31: ; %bb7.i.i ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] +; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 +; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], s[34:35] @@ -589,9 +592,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] -; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 -; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS0-NEXT: .LBB1_32: ; %Flow @@ -600,6 +600,9 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.33: ; %bb11.i.i ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 +; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] +; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 +; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] ; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], s[36:37] ; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], 
s[34:35] @@ -607,9 +610,6 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 -; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] -; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 -; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GLOBALNESS0-NEXT: .LBB1_34: ; %UnifiedUnreachableBlock bb: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll index a0bce34..c0b56d0 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange.ll @@ -161,16 +161,16 @@ for.end: define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_func, ptr %extern_func2) #0 { ; SI-LABEL: loop: ; SI: ; %bb.0: ; %main_body +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 -; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 ; SI-NEXT: s_add_u32 s12, s12, s1 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v6 ; SI-NEXT: ; implicit-def: $vgpr1 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: s_xor_b32 s6, exec_lo, s0 @@ -243,11 +243,11 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e ; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; SI-NEXT: s_mov_b32 s14, -1 ; SI-NEXT: v_mov_b32_e32 v40, v1 +; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: s_mov_b32 s15, 0x31c16000 ; SI-NEXT: s_add_u32 s12, s12, s1 ; SI-NEXT: s_addc_u32 s13, s13, 0 ; SI-NEXT: s_mov_b32 s32, 0 -; SI-NEXT: v_cmp_gt_i32_e32 vcc_lo, 6, v0 ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: s_and_saveexec_b32 s0, vcc_lo ; SI-NEXT: s_xor_b32 s6, 
exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 0307472..e49dd9e 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -372,8 +372,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 { ; GFX1032-NEXT: .LBB10_2: ; %bb2 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0 ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0 ; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB10_4 ; GFX1032-NEXT: ; %bb.3: ; %bb5 @@ -515,8 +515,8 @@ bb13: define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #0 { ; GFX1032-LABEL: test_loop_with_if_else_break: ; GFX1032: ; %bb.0: ; %bb -; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_6 ; GFX1032-NEXT: ; %bb.1: ; %.preheader diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 2e59a36..af7d169 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -416,10 +416,10 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: v_cndmask_b32_e64 v2, 0, v0, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called@rel32@hi+12 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 ; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll 
b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 08cc2e4..ddc50b7 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -426,12 +426,12 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) { ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: s_getpc_b64 s[22:23] ; GFX9-O3-NEXT: s_add_u32 s22, s22, called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s23, s23, called@rel32@hi+12 +; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 @@ -1278,12 +1278,12 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in ; GFX9-O3-NEXT: s_mov_b64 s[4:5], s[0:1] ; GFX9-O3-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX9-O3-NEXT: s_mov_b64 s[0:1], s[24:25] -; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 -; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: s_getpc_b64 s[22:23] ; GFX9-O3-NEXT: s_add_u32 s22, s22, strict_wwm_called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s23, s23, strict_wwm_called@rel32@hi+12 +; GFX9-O3-NEXT: v_mov_b32_e32 v31, v3 +; GFX9-O3-NEXT: s_mov_b64 s[2:3], s[26:27] +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[22:23] ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v0 ; GFX9-O3-NEXT: v_add_u32_e32 v3, v3, v6 diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll index f367a56..b92f03d 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping-linux.ll @@ -41,8 +41,8 @@ define fastcc ptr 
@wrongUseOfPostDominate(ptr readonly %s, i32 %off, ptr readnon ; ENABLE-NEXT: bhs .LBB0_6 ; ENABLE-NEXT: @ %bb.5: @ %while.body ; ENABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; ENABLE-NEXT: cmp r0, r2 ; ENABLE-NEXT: mov r1, r3 +; ENABLE-NEXT: cmp r0, r2 ; ENABLE-NEXT: blo .LBB0_4 ; ENABLE-NEXT: .LBB0_6: @ %if.end29 ; ENABLE-NEXT: pop {r11, pc} @@ -131,8 +131,8 @@ define fastcc ptr @wrongUseOfPostDominate(ptr readonly %s, i32 %off, ptr readnon ; DISABLE-NEXT: bhs .LBB0_6 ; DISABLE-NEXT: @ %bb.5: @ %while.body ; DISABLE-NEXT: @ in Loop: Header=BB0_4 Depth=1 -; DISABLE-NEXT: cmp r0, r2 ; DISABLE-NEXT: mov r1, r3 +; DISABLE-NEXT: cmp r0, r2 ; DISABLE-NEXT: blo .LBB0_4 ; DISABLE-NEXT: .LBB0_6: @ %if.end29 ; DISABLE-NEXT: pop {r11, pc} diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll index 7541647..5b2d0a8 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -2017,8 +2017,8 @@ define float @debug_info(float %gamma, float %slopeLimit, i1 %or.cond, double %t ; ARM-DISABLE-NEXT: sub r4, sp, #24 ; ARM-DISABLE-NEXT: bfc r4, #0, #4 ; ARM-DISABLE-NEXT: mov sp, r4 -; ARM-DISABLE-NEXT: tst r2, #1 ; ARM-DISABLE-NEXT: vst1.64 {d8, d9}, [r4:128] +; ARM-DISABLE-NEXT: tst r2, #1 ; ARM-DISABLE-NEXT: vstr d10, [r4, #16] ; ARM-DISABLE-NEXT: beq LBB12_2 ; ARM-DISABLE-NEXT: @ %bb.1: @ %bb3 @@ -2123,8 +2123,8 @@ define float @debug_info(float %gamma, float %slopeLimit, i1 %or.cond, double %t ; THUMB-DISABLE-NEXT: sub.w r4, sp, #24 ; THUMB-DISABLE-NEXT: bfc r4, #0, #4 ; THUMB-DISABLE-NEXT: mov sp, r4 -; THUMB-DISABLE-NEXT: lsls r1, r2, #31 ; THUMB-DISABLE-NEXT: vst1.64 {d8, d9}, [r4:128] +; THUMB-DISABLE-NEXT: lsls r1, r2, #31 ; THUMB-DISABLE-NEXT: vstr d10, [r4, #16] ; THUMB-DISABLE-NEXT: beq LBB12_2 ; THUMB-DISABLE-NEXT: @ %bb.1: @ %bb3 diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll index 7bc7b84..920742f 100644 --- 
a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll +++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll @@ -50,9 +50,9 @@ define void @test_pr22678() { define <4 x i32> @test_vmovrrd_combine() nounwind { ; CHECK-LABEL: test_vmovrrd_combine: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ implicit-def: $q8 ; CHECK-NEXT: mov r0, #0 ; CHECK-NEXT: cmp r0, #0 -; CHECK-NEXT: @ implicit-def: $q8 ; CHECK-NEXT: bne .LBB3_2 ; CHECK-NEXT: @ %bb.1: @ %bb1.preheader ; CHECK-NEXT: vmov.i32 q8, #0x0 diff --git a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll index ac9641f..c36b3bf 100644 --- a/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll +++ b/llvm/test/CodeGen/PowerPC/p10-spill-creq.ll @@ -54,13 +54,13 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 { ; CHECK-NEXT: # implicit-def: $r4 ; CHECK-NEXT: .LBB0_8: # %bb20 ; CHECK-NEXT: mfcr r12 -; CHECK-NEXT: cmpwi cr2, r3, -1 ; CHECK-NEXT: cmpwi cr3, r4, -1 +; CHECK-NEXT: cmpwi cr2, r3, -1 ; CHECK-NEXT: stw r12, 8(r1) ; CHECK-NEXT: cmpwi cr7, r3, 0 ; CHECK-NEXT: cmpwi cr6, r4, 0 -; CHECK-NEXT: crand 4*cr5+gt, 4*cr2+gt, 4*cr1+lt ; CHECK-NEXT: crand 4*cr5+lt, 4*cr3+gt, 4*cr5+un +; CHECK-NEXT: crand 4*cr5+gt, 4*cr2+gt, 4*cr1+lt ; CHECK-NEXT: # implicit-def: $x3 ; CHECK-NEXT: bc 4, 4*cr5+gt, .LBB0_10 ; CHECK-NEXT: # %bb.9: # %bb34 @@ -95,15 +95,15 @@ define dso_local double @P10_Spill_CR_EQ(ptr %arg) local_unnamed_addr #0 { ; CHECK-NEXT: lwz r7, 0(r3) ; CHECK-NEXT: .LBB0_18: # %bb58 ; CHECK-NEXT: lwz r6, 92(r6) +; CHECK-NEXT: cmpwi cr4, r7, 1 ; CHECK-NEXT: crand 4*cr7+un, 4*cr3+gt, 4*cr6+un ; CHECK-NEXT: cmpwi cr3, r5, 1 -; CHECK-NEXT: cmpwi cr4, r7, 1 ; CHECK-NEXT: crand 4*cr7+gt, 4*cr7+eq, 4*cr1+lt ; CHECK-NEXT: # implicit-def: $x5 ; CHECK-NEXT: crand 4*cr6+un, 4*cr2+eq, 4*cr6+un ; CHECK-NEXT: crand 4*cr5+un, 4*cr6+eq, 4*cr5+un -; CHECK-NEXT: crand 4*cr6+gt, 4*cr3+lt, 4*cr6+gt ; CHECK-NEXT: crand 4*cr7+lt, 4*cr4+lt, 4*cr7+lt +; CHECK-NEXT: crand 4*cr6+gt, 4*cr3+lt, 4*cr6+gt ; 
CHECK-NEXT: cmpwi r6, 1 ; CHECK-NEXT: crand 4*cr6+lt, lt, 4*cr6+lt ; CHECK-NEXT: bc 4, 4*cr6+gt, .LBB0_20 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index 6cb9855..ca1f103 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -24,8 +24,8 @@ define arm_aapcs_vfpcc void @fast_float_mul(ptr nocapture %a, ptr nocapture read ; CHECK-NEXT: cmpeq.w r12, #0 ; CHECK-NEXT: beq .LBB0_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r4, r3, #1 ; CHECK-NEXT: and r12, r3, #3 +; CHECK-NEXT: subs r4, r3, #1 ; CHECK-NEXT: cmp r4, #3 ; CHECK-NEXT: bhs .LBB0_6 ; CHECK-NEXT: @ %bb.3: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll index 44cbd7d..9c36bae 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/minloop.ll @@ -10,8 +10,8 @@ define void @arm_min_q31(ptr nocapture readonly %pSrc, i32 %blockSize, ptr nocap ; CHECK-NEXT: subs.w r9, r1, #1 ; CHECK-NEXT: beq .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %while.body.preheader -; CHECK-NEXT: subs r7, r1, #2 ; CHECK-NEXT: and r8, r9, #3 +; CHECK-NEXT: subs r7, r1, #2 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index c0bc34c..7c6c7e9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1411,8 +1411,8 @@ define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocap ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: cbz r2, .LBB9_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: and r12, r2, #3 +; CHECK-NEXT: subs 
r3, r2, #1 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB9_4 ; CHECK-NEXT: @ %bb.2: @@ -1566,8 +1566,8 @@ define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocap ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: cbz r2, .LBB10_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: and r12, r2, #3 +; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB10_4 ; CHECK-NEXT: @ %bb.2: @@ -1721,8 +1721,8 @@ define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: cbz r2, .LBB11_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: and r12, r2, #3 +; CHECK-NEXT: subs r3, r2, #1 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhs .LBB11_4 ; CHECK-NEXT: @ %bb.2: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index 715f656..8a5a15a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -348,8 +348,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(ptr nocapture readonly ; CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: beq .LBB5_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB5_6 ; CHECK-NEXT: @ %bb.3: @@ -624,8 +624,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(ptr nocapture readonl ; CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: beq .LBB7_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB7_6 ; CHECK-NEXT: @ %bb.3: @@ -900,8 +900,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(ptr nocapture readonly ; 
CHECK-NEXT: cmpeq r7, #0 ; CHECK-NEXT: beq .LBB9_4 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader -; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: and r12, r4, #3 +; CHECK-NEXT: subs r7, r4, #1 ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB9_6 ; CHECK-NEXT: @ %bb.3: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 3c4af10..6f986ce 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -446,8 +446,8 @@ define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(ptr nocapture read ; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: vdup.32 q0, r3 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w r3, r3, r7, lsr #2 ; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: add.w r3, r3, r7, lsr #2 ; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB6_5: @ %vector.body46 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index 495ffe8..eb52b5a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -1060,12 +1060,12 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: vfma.f32 q0, q4, r5 ; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] ; CHECK-NEXT: vfma.f32 q0, q5, r6 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: vfma.f32 q0, q2, lr ; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] +; CHECK-NEXT: vfma.f32 q0, q2, lr +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vfma.f32 q0, q3, r11 -; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: vfma.f32 q0, q1, r8 +; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: blo .LBB16_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 @@ -1603,8 +1603,8 @@ define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly % ; CHECK-NEXT: .LBB19_3: @ %do.body 
; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB19_5 Depth 2 -; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: ldrd r5, r11, [r9] +; CHECK-NEXT: mov r6, r2 ; CHECK-NEXT: ldrd r8, r10, [r9, #8] ; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload ; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index e63c625..e8b49c1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -1376,8 +1376,8 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture reado ; CHECK-NEXT: bne .LBB16_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: cmp r9, r1 ; CHECK-NEXT: bne .LBB16_2 ; CHECK-NEXT: .LBB16_5: @ %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll index 7b8b884..eedca2c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -712,8 +712,8 @@ define dso_local void @arm_mat_mult_q15(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: @ %bb.12: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 ; CHECK-NEXT: vaddv.u32 r10, q4 -; CHECK-NEXT: cmp r2, r12 ; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: cmp r2, r12 ; CHECK-NEXT: beq .LBB10_7 ; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader ; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2 diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll index 77980be..652d25a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll @@ -180,9 +180,9 @@ define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1 ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: add.w r2, r9, r10 -; CHECK-NEXT: sub.w r5, r8, r9 ; CHECK-NEXT: add.w r7, r1, r9, lsl #1 ; CHECK-NEXT: add.w r2, r1, r2, lsl #1 +; CHECK-NEXT: sub.w r5, r8, r9 ; CHECK-NEXT: dlstp.32 lr, r5 ; CHECK-NEXT: .LBB4_11: @ %vec.epilog.vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll index 45bb70e..f90af3c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -258,11 +258,11 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 ; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: adds r0, r5, #2 ; CHECK-NEXT: adds r2, r5, #1 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: mov r3, r9 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: mov r0, r12 ; CHECK-NEXT: mov r4, r10 ; CHECK-NEXT: vmov q2, q0 @@ -618,13 +618,13 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: add.w r10, r0, #2 ; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: add.w r10, r0, #2 ; CHECK-NEXT: add.w r11, r0, #1 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r3, r8 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: vmov 
q3, q1 ; CHECK-NEXT: vmov q2, q1 @@ -833,8 +833,8 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r0, #2 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #1 +; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: mov r3, r8 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vmov q4, q1 @@ -1068,8 +1068,8 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: add.w r8, r0, #1 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r3, r9 ; CHECK-NEXT: vmov q4, q2 ; CHECK-NEXT: vmov q5, q2 @@ -1347,11 +1347,11 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: adds r4, r0, #3 ; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: adds r4, r0, #3 ; CHECK-NEXT: add.w r8, r0, #2 ; CHECK-NEXT: adds r1, r0, #1 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: vmov q5, q3 ; CHECK-NEXT: vmov q6, q3 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll index dd63b85..096d438 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-distribute.ll @@ -100,8 +100,8 @@ define void @arm_cmplx_dot_prod_q15(ptr nocapture readonly %pSrcA, ptr nocapture ; CHECK-NEXT: ldr.w r8, [sp, #36] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: mov r5, r7 -; CHECK-NEXT: and r2, r2, #3 ; CHECK-NEXT: lsrl r6, r5, #6 +; CHECK-NEXT: and r2, r2, #3 ; CHECK-NEXT: wls lr, r2, .LBB1_7 ; 
CHECK-NEXT: .LBB1_5: @ %while.body11 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll index c03339b..cba0f9c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -708,14 +708,14 @@ define ptr @signext(ptr %input_row, ptr %input_col, i16 zeroext %output_ch, i16 ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB5_4: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: add.w r0, r8, r10 ; CHECK-NEXT: ldr r1, [sp, #100] +; CHECK-NEXT: add.w r0, r8, r10 ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 ; CHECK-NEXT: strb.w r0, [r1, r11] ; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r11, r0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_5: @ %for.body @@ -933,14 +933,14 @@ define ptr @signext_optsize(ptr %input_row, ptr %input_col, i16 zeroext %output_ ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: .LBB6_7: @ %for.cond.cleanup23 ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: add.w r0, r8, r10 ; CHECK-NEXT: ldr r1, [sp, #100] +; CHECK-NEXT: add.w r0, r8, r10 ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 ; CHECK-NEXT: strb.w r0, [r1, r11] ; CHECK-NEXT: add.w r11, r11, #1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: cmp r11, r0 ; CHECK-NEXT: bne .LBB6_3 ; CHECK-NEXT: .LBB6_8: @ %if.end diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll index 9400f24..723cbff 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -11,8 +11,8 @@ define void @arm_min_helium_f32(ptr %pSrc, i32 %blockSize, ptr nocapture 
%pResul ; CHECK-NEXT: vidup.u32 q2, r4, #1 ; CHECK-NEXT: movw r5, #54437 ; CHECK-NEXT: movt r5, #21352 -; CHECK-NEXT: vdup.32 q1, r5 ; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vdup.32 q1, r5 ; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll index 42a00b6..244a965 100644 --- a/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll +++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-nonzerostart.ll @@ -11,10 +11,10 @@ define arm_aapcs_vfpcc void @start12(ptr nocapture readonly %x, ptr nocapture re ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph ; CHECK-NEXT: vmov r12, s0 -; CHECK-NEXT: subs r3, #12 ; CHECK-NEXT: adds r0, #48 ; CHECK-NEXT: adds r1, #48 ; CHECK-NEXT: adds r2, #48 +; CHECK-NEXT: subs r3, #12 ; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll index 03b769f..7bb0c74 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-varargs-2.ll @@ -42,9 +42,9 @@ define hidden i32 @_Z1fiz(i32 %n, ...) 
local_unnamed_addr #0 { ; CHECK-NEXT: mov r4, r0 ; CHECK-NEXT: add r0, sp, #28 ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: cmp r4, #1 ; CHECK-NEXT: stm r0!, {r1, r2, r3} ; CHECK-NEXT: add r0, sp, #28 +; CHECK-NEXT: cmp r4, #1 ; CHECK-NEXT: str r0, [sp, #4] ; CHECK-NEXT: blt .LBB0_2 ; CHECK-NEXT: .LBB0_1: @ %for.body diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll index 5eb5990..e6fcf56 100644 --- a/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-vla.ll @@ -46,8 +46,8 @@ define hidden i32 @f(i32 %n) local_unnamed_addr #0 { ; CHECK-NEXT: cmp r5, #1 ; CHECK-NEXT: blt .LBB0_3 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: subs r0, r5, #1 ; CHECK-NEXT: and r12, r5, #3 +; CHECK-NEXT: subs r0, r5, #1 ; CHECK-NEXT: cmp r0, #3 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: diff --git a/llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll b/llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll index 4e700ce..e260286 100644 --- a/llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll +++ b/llvm/test/CodeGen/Thumb2/setjmp_longjmp.ll @@ -58,8 +58,8 @@ define void @double_foobar() { ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: movs r0, #2 ; CHECK-NEXT: str r0, [r1] -; CHECK-NEXT: add r1, sp, #4 ; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: add r1, sp, #4 ; CHECK-NEXT: ldr r0, [r1, #8] ; CHECK-NEXT: mov sp, r0 ; CHECK-NEXT: ldr r0, [r1, #4] diff --git a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll index 1614de8..eb07677 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll @@ -23,9 +23,9 @@ define i32 @simple(ptr %a, ptr %b, i32 %x) nounwind { ; A9-NEXT: add.w r4, lr, r2 ; A9-NEXT: ldr.w r6, [lr, r2] ; A9-NEXT: add r0, r3 -; A9-NEXT: adds r3, r4, r2 -; A9-NEXT: add r0, r12 ; A9-NEXT: ldr r5, [r4, r2] +; A9-NEXT: add r0, r12 +; A9-NEXT: adds r3, r4, r2 ; 
A9-NEXT: add r0, r6 ; A9-NEXT: add r3, r2 ; A9-NEXT: add r0, r5 |